/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hpeldsp_mips.h"

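/* Pack the even-indexed bytes of in0/in1 into one vector, take the rounding
 * byte average ((a + b + 1) >> 1) with the vector dst, and store the 16-byte
 * result at pdst. */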
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
{                                                             \
    v16u8 tmp_m;                                              \
                                                              \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
    ST_UB(tmp_m, (pdst));                                     \
}

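/* Pack the even-indexed bytes of four vector pairs and store the four packed
 * rows at pdst, stride bytes apart. */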
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
}

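/* Narrow four vectors of halfword results to bytes (PCKEV_B2), gather four
 * 8-byte destination rows into two vectors (PCKEV_D2), average the pairs and
 * store the resulting 8x4 byte block. */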
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);                  \
}

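/* Horizontal (x2) half-pel "put": each output byte is the rounding average of
 * two horizontal neighbours,
 *
 *     dst[x] = (src[x] + src[x + 1] + 1) >> 1
 *
 * Each row is paired with a copy shifted by one byte (SLDI_B* with a zero
 * vector) so that lane x holds src[x + 1], then both are byte-averaged. */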
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                       dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

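/* The no_rnd variants below differ only in the averaging: they use the
 * truncating byte average (a + b) >> 1 (the AVE_* macros from
 * generic_macros_msa.h) instead of the rounding average used above. */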
static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
    v16i8 zeros = { 0 };

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
               src4_sld1, src5_sld1, src6_sld1, src7_sld1);

    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
}

static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
}

static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}

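/* The _and_aver_dst variants implement the "avg" flavour: the interpolated
 * value is averaged once more, with rounding, against the bytes already in
 * the destination:
 *
 *     dst[x] = (dst[x] + ((src[x] + src[x + 1] + 1) >> 1) + 1) >> 1
 */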
static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);

        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}

static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);

        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
                          src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                           dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                           dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

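/* Vertical (y2) half-pel: dst[x] = (src[x] + src[x + src_stride] + 1) >> 1.
 * The last row of each iteration is reused as the first row of the next
 * (src0 = srcN), so every source row is loaded exactly once. */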
static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src2, res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                      dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}

static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                       dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src8;
    }
}

static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
    dst += (4 * dst_stride);

    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                 dst, dst_stride);
}

static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4;

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
}

static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);
    src16 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src12, src13, src13, src14,
                  src14, src15, src15, src16, dst, dst_stride);
}

static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
}

static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1, dst0, dst1;
    v16u8 src0, src1, src2;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16u8 res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);
        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
        src0 = src2;
    }
}

static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                          dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);
        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                    res0, res1, res2, res3);
        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                    res4, res5, res6, res7);

        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
                    res0, res1, res2, res3);
        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
                    res4, res5, res6, res7);
        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
        dst += (8 * dst_stride);

        src0 = src8;
    }
}

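/* Horizontal + vertical (xy2) half-pel. Each output byte averages a 2x2
 * neighbourhood with rounding:
 *
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + src_stride] + src[x + src_stride + 1] + 2) >> 2
 *
 * Per row, the byte pairs src[x]/src[x + 1] are interleaved (ILVR) and
 * horizontally added into halfwords (HADD_UB*); the sums of two vertically
 * adjacent rows are then added and shifted right by 2 with rounding (SRARI). */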
static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t res0, res1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r, res;
    v8u16 add0, add1, add2, sum0, sum1;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
                   src0_r, src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
        res0 = __msa_copy_u_w((v4i32) res, 0);
        res1 = __msa_copy_u_w((v4i32) res, 2);
        SW(res0, dst);
        dst += dst_stride;
        SW(res1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
        ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
             sum0_r, sum1_r, sum2_r, sum3_r);
        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
             sum4_r, sum5_r, sum6_r, sum7_r);
        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
             sum0_l, sum1_l, sum2_l, sum3_l);
        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
             sum4_l, sum5_l, sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
                     sum3_l, sum3_r, dst, dst_stride);
        dst += (4 * dst_stride);
        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
                     sum7_l, sum7_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

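/* The xy2 no_rnd variants add a rounding constant of 1 instead of 2:
 *
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + src_stride] + src[x + src_stride + 1] + 1) >> 2
 *
 * hence the explicit "+ 1" on the sums followed by a non-rounding shift
 * (SRA_4V) below. */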
static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
    v16i8 out0, out1;
    v16i8 zeros = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
               src5_sld1, src6_sld1);
    SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
               src3, src0_r, src1_r, src2_r, src3_r);
    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
               src5_r, src6_r);
    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);

    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    sum4 = add4 + add5 + 1;
    sum5 = add5 + add6 + 1;
    sum6 = add6 + add7 + 1;
    sum7 = add7 + add8 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    SRA_4V(sum4, sum5, sum6, sum7, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 out0, out1;
    v16i8 zeros = { 0 };

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    src4 = LD_SB(src);

    SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
               src1_sld1, src2_sld1);
    SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
               src1_r, src2_r);
    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB2_UH(src3_r, src4_r, add3, add4);

    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
    dst += (4 * dst_stride);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}

static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}

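/* xy2 "avg" variants: the rounded xy2 result is further averaged with the
 * existing destination bytes before the store. */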
static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r;
    v8u16 add0, add1, add2, sum0, sum1;
    v16u8 dst0, dst1, res0, res1;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        src0 = src2;
    }
}

static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
        src0 = src4;
    }
}

static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
             sum2_r, sum3_r);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
             sum6_r, sum7_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
             sum2_l, sum3_l);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
             sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
        dst += dst_stride;
    }
}

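/* Straight copies for the plain "put" ops (no interpolation). The height is
 * processed in blocks of 12, 8, 4 or 2 rows, picked by divisibility. */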
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}

static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width)
{
    int32_t cnt, loop_cnt;
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src_tmp += (8 * src_stride);

            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
        }

        src += 16;
        dst += 16;
    }
}

static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst, dst_stride);
            dst += (8 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

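/* Plain "avg" ops (no interpolation):
 *
 *     dst[x] = (dst[x] + src[x] + 1) >> 1
 */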
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}

static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

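/* Exported hpeldsp entry points. In the DSP API the argument order is
 * (block, pixels) = (dst, src), so the helpers above are called with the
 * pointers swapped. The suffixes follow hpeldsp convention: _x2 is the
 * horizontal half-pel case, _y2 the vertical one, _xy2 both; "put" overwrites
 * the destination, "avg" averages into it, and "no_rnd" uses the truncating
 * average. */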
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    copy_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    copy_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
                                    const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    avg_width16_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width8_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width4_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}