1/*
2 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "h264chroma_mips.h"
23
24static const uint8_t chroma_mask_arr[16 * 5] = {
25    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
26    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
27    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
29    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
30};
31
32static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
33                                  uint32_t coeff0, uint32_t coeff1)
34{
35    uint16_t out0, out1;
36    v16i8 src0, src1;
37    v8u16 res_r;
38    v8i16 res;
39    v16i8 mask;
40    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
41    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
42    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
43
44    mask = LD_SB(&chroma_mask_arr[0]);
45
46    LD_SB2(src, stride, src0, src1);
47
48    src0 = __msa_vshf_b(mask, src1, src0);
49    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
50    res_r <<= 3;
51    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
52    res_r = __msa_sat_u_h(res_r, 7);
53    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
54
55    out0 = __msa_copy_u_h(res, 0);
56    out1 = __msa_copy_u_h(res, 2);
57
58    SH(out0, dst);
59    dst += stride;
60    SH(out1, dst);
61}
62
63static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
64                                  uint32_t coeff0, uint32_t coeff1)
65{
66    v16u8 src0, src1, src2, src3;
67    v8u16 res_r;
68    v8i16 res;
69    v16i8 mask;
70    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
71    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
72    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
73
74    mask = LD_SB(&chroma_mask_arr[64]);
75
76    LD_UB4(src, stride, src0, src1, src2, src3);
77
78    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
79
80    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
81
82    res_r = __msa_dotp_u_h(src0, coeff_vec);
83    res_r <<= 3;
84    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
85    res_r = __msa_sat_u_h(res_r, 7);
86    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
87
88    ST_H4(res, 0, 1, 2, 3, dst, stride);
89}
90
91static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
92                                 uint32_t coeff0, uint32_t coeff1,
93                                 int32_t height)
94{
95    if (2 == height) {
96        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
97    } else if (4 == height) {
98        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
99    }
100}
101
102static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
103                                  uint32_t coeff0, uint32_t coeff1)
104{
105    v16i8 src0, src1;
106    v8u16 res_r;
107    v4i32 res;
108    v16i8 mask;
109    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
110    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
111    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
112
113    mask = LD_SB(&chroma_mask_arr[0]);
114
115    LD_SB2(src, stride, src0, src1);
116
117    src0 = __msa_vshf_b(mask, src1, src0);
118    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
119    res_r <<= 3;
120    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
121    res_r = __msa_sat_u_h(res_r, 7);
122    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
123
124    ST_W2(res, 0, 1, dst, stride);
125}
126
127static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
128                                  uint32_t coeff0, uint32_t coeff1)
129{
130    v16u8 src0, src1, src2, src3, out;
131    v8u16 res0_r, res1_r;
132    v16i8 mask;
133    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
134    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
135    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
136
137    mask = LD_SB(&chroma_mask_arr[0]);
138
139    LD_UB4(src, stride, src0, src1, src2, src3);
140    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
141    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
142    res0_r <<= 3;
143    res1_r <<= 3;
144    SRARI_H2_UH(res0_r, res1_r, 6);
145    SAT_UH2_UH(res0_r, res1_r, 7);
146    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
147    ST_W4(out, 0, 1, 2, 3, dst, stride);
148}
149
150static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
151                                  uint32_t coeff0, uint32_t coeff1)
152{
153    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
154    v16i8 mask;
155    v8u16 res0, res1, res2, res3;
156    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
157    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
158    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
159
160    mask = LD_SB(&chroma_mask_arr[0]);
161
162    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
163    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
164    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
165    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
166    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
167    SLLI_4V(res0, res1, res2, res3, 3);
168    SRARI_H4_UH(res0, res1, res2, res3, 6);
169    SAT_UH4_UH(res0, res1, res2, res3, 7);
170    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
171    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
172}
173
174static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
175                                 uint32_t coeff0, uint32_t coeff1,
176                                 int32_t height)
177{
178    if (2 == height) {
179        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
180    } else if (4 == height) {
181        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
182    } else if (8 == height) {
183        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
184    }
185}
186
187static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
188                                  uint32_t coeff0, uint32_t coeff1)
189{
190    v16u8 src0, src1, src2, src3, out0, out1;
191    v8u16 res0, res1, res2, res3;
192    v16i8 mask;
193    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
194    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
195    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
196
197    mask = LD_SB(&chroma_mask_arr[32]);
198    LD_UB4(src, stride, src0, src1, src2, src3);
199    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
200    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
201    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
202                coeff_vec, res0, res1, res2, res3);
203    SLLI_4V(res0, res1, res2, res3, 3);
204    SRARI_H4_UH(res0, res1, res2, res3, 6);
205    SAT_UH4_UH(res0, res1, res2, res3, 7);
206    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
207    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
208}
209
210static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
211                                  uint32_t coeff0, uint32_t coeff1)
212{
213    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
214    v16u8 out0, out1, out2, out3;
215    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
216    v16i8 mask;
217    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
218    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
219    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
220
221    mask = LD_SB(&chroma_mask_arr[32]);
222
223    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
224    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
225    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
226    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
227    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
228    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
229                coeff_vec, res0, res1, res2, res3);
230    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
231                coeff_vec, res4, res5, res6, res7);
232    SLLI_4V(res0, res1, res2, res3, 3);
233    SLLI_4V(res4, res5, res6, res7, 3);
234    SRARI_H4_UH(res0, res1, res2, res3, 6);
235    SRARI_H4_UH(res4, res5, res6, res7, 6);
236    SAT_UH4_UH(res0, res1, res2, res3, 7);
237    SAT_UH4_UH(res4, res5, res6, res7, 7);
238    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
239    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
240    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
241}
242
243static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
244                                      int32_t stride, uint32_t coeff0,
245                                      uint32_t coeff1, int32_t height)
246{
247    uint32_t row;
248    v16u8 src0, src1, src2, src3, out0, out1;
249    v8u16 res0, res1, res2, res3;
250    v16i8 mask;
251    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
252    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
253    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
254
255    mask = LD_SB(&chroma_mask_arr[32]);
256
257    for (row = height >> 2; row--;) {
258        LD_UB4(src, stride, src0, src1, src2, src3);
259        src += (4 * stride);
260
261        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
262        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
263        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
264                    coeff_vec, res0, res1, res2, res3);
265        SLLI_4V(res0, res1, res2, res3, 3);
266        SRARI_H4_UH(res0, res1, res2, res3, 6);
267        SAT_UH4_UH(res0, res1, res2, res3, 7);
268        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
269        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
270        dst += (4 * stride);
271    }
272
273    if (0 != (height % 4)) {
274        for (row = (height % 4); row--;) {
275            src0 = LD_UB(src);
276            src += stride;
277
278            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
279
280            res0 = __msa_dotp_u_h(src0, coeff_vec);
281            res0 <<= 3;
282            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
283            res0 = __msa_sat_u_h(res0, 7);
284            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
285
286            ST_D1(res0, 0, dst);
287            dst += stride;
288        }
289    }
290}
291
292static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
293                                 uint32_t coeff0, uint32_t coeff1,
294                                 int32_t height)
295{
296    if (4 == height) {
297        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
298    } else if (8 == height) {
299        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
300    } else {
301        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
302    }
303}
304
305static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
306                                  uint32_t coeff0, uint32_t coeff1)
307{
308    uint16_t out0, out1;
309    v16i8 src0, src1, src2;
310    v16u8 tmp0, tmp1;
311    v8i16 res;
312    v8u16 res_r;
313    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
314    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
315    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
316
317    LD_SB3(src, stride, src0, src1, src2);
318
319    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
320
321    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
322
323    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
324    res_r <<= 3;
325    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
326    res_r = __msa_sat_u_h(res_r, 7);
327    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
328
329    out0 = __msa_copy_u_h(res, 0);
330    out1 = __msa_copy_u_h(res, 2);
331
332    SH(out0, dst);
333    dst += stride;
334    SH(out1, dst);
335}
336
337static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
338                                  uint32_t coeff0, uint32_t coeff1)
339{
340    v16u8 src0, src1, src2, src3, src4;
341    v16u8 tmp0, tmp1, tmp2, tmp3;
342    v8i16 res;
343    v8u16 res_r;
344    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
345    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
346    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
347
348    LD_UB5(src, stride, src0, src1, src2, src3, src4);
349    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
350               tmp0, tmp1, tmp2, tmp3);
351    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
352
353    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
354
355    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
356    res_r <<= 3;
357    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
358    res_r = __msa_sat_u_h(res_r, 7);
359
360    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
361
362    ST_H4(res, 0, 1, 2, 3, dst, stride);
363}
364
365static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
366                                 uint32_t coeff0, uint32_t coeff1,
367                                 int32_t height)
368{
369    if (2 == height) {
370        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
371    } else if (4 == height) {
372        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
373    }
374}
375
376static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
377                                  uint32_t coeff0, uint32_t coeff1)
378{
379    v16u8 src0, src1, src2;
380    v16u8 tmp0, tmp1;
381    v4i32 res;
382    v8u16 res_r;
383    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
384    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
385    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
386
387    LD_UB3(src, stride, src0, src1, src2);
388    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
389
390    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
391    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
392    res_r <<= 3;
393    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
394    res_r = __msa_sat_u_h(res_r, 7);
395    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
396
397    ST_W2(res, 0, 1, dst, stride);
398}
399
400static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
401                                  uint32_t coeff0, uint32_t coeff1)
402{
403    v16u8 src0, src1, src2, src3, src4;
404    v16u8 tmp0, tmp1, tmp2, tmp3;
405    v16u8 out;
406    v8u16 res0_r, res1_r;
407    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
408    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
409    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
410
411    LD_UB5(src, stride, src0, src1, src2, src3, src4);
412    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
413               tmp3);
414    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
415    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
416    res0_r <<= 3;
417    res1_r <<= 3;
418    SRARI_H2_UH(res0_r, res1_r, 6);
419    SAT_UH2_UH(res0_r, res1_r, 7);
420    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
421    ST_W4(out, 0, 1, 2, 3, dst, stride);
422}
423
424static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
425                                  uint32_t coeff0, uint32_t coeff1)
426{
427    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
428    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
429    v8u16 res0, res1, res2, res3;
430    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
431    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
432    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
433
434    LD_UB5(src, stride, src0, src1, src2, src3, src4);
435    src += (5 * stride);
436    LD_UB4(src, stride, src5, src6, src7, src8);
437    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
438               tmp3);
439    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
440               tmp7);
441    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
442    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
443    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
444    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
445    SLLI_4V(res0, res1, res2, res3, 3);
446    SRARI_H4_UH(res0, res1, res2, res3, 6);
447    SAT_UH4_UH(res0, res1, res2, res3, 7);
448    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
449    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
450}
451
452static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
453                                 uint32_t coeff0, uint32_t coeff1,
454                                 int32_t height)
455{
456    if (2 == height) {
457        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
458    } else if (4 == height) {
459        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
460    } else if (8 == height) {
461        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
462    }
463}
464
465static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
466                                  uint32_t coeff0, uint32_t coeff1)
467{
468    v16u8 src0, src1, src2, src3, src4, out0, out1;
469    v8u16 res0, res1, res2, res3;
470    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
471    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
472    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
473
474    LD_UB5(src, stride, src0, src1, src2, src3, src4);
475    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
476               src3);
477    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
478                coeff_vec, res0, res1, res2, res3);
479    SLLI_4V(res0, res1, res2, res3, 3);
480    SRARI_H4_UH(res0, res1, res2, res3, 6);
481    SAT_UH4_UH(res0, res1, res2, res3, 7);
482    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
483    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
484}
485
486static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
487                                  uint32_t coeff0, uint32_t coeff1)
488{
489    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
490    v16u8 out0, out1, out2, out3;
491    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
492    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
493    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
494    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
495
496    LD_UB5(src, stride, src0, src1, src2, src3, src4);
497    src += (5 * stride);
498    LD_UB4(src, stride, src5, src6, src7, src8);
499    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
500               src3);
501    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
502               src7);
503    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
504                coeff_vec, res0, res1, res2, res3);
505    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
506                coeff_vec, res4, res5, res6, res7);
507    SLLI_4V(res0, res1, res2, res3, 3);
508    SLLI_4V(res4, res5, res6, res7, 3);
509    SRARI_H4_UH(res0, res1, res2, res3, 6);
510    SRARI_H4_UH(res4, res5, res6, res7, 6);
511    SAT_UH4_UH(res0, res1, res2, res3, 7);
512    SAT_UH4_UH(res0, res1, res2, res3, 7);
513    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
514    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
515    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
516}
517
518static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
519                                 uint32_t coeff0, uint32_t coeff1,
520                                 int32_t height)
521{
522    if (4 == height) {
523        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
524    } else if (8 == height) {
525        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
526    }
527}
528
529static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
530                                  uint32_t coef_hor0, uint32_t coef_hor1,
531                                  uint32_t coef_ver0, uint32_t coef_ver1)
532{
533    uint16_t out0, out1;
534    v16u8 src0, src1, src2;
535    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
536    v8i16 res_vert;
537    v16i8 mask;
538    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
539    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
540    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
541    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
542    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
543
544    mask = LD_SB(&chroma_mask_arr[48]);
545
546    LD_UB3(src, stride, src0, src1, src2);
547    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
548    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
549    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
550
551    res_vt0 += res_vt1;
552    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
553    res_vt0 = __msa_sat_u_h(res_vt0, 7);
554    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
555
556    out0 = __msa_copy_u_h(res_vert, 0);
557    out1 = __msa_copy_u_h(res_vert, 1);
558
559    SH(out0, dst);
560    dst += stride;
561    SH(out1, dst);
562}
563
564static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
565                                  uint32_t coef_hor0, uint32_t coef_hor1,
566                                  uint32_t coef_ver0, uint32_t coef_ver1)
567{
568    v16u8 src0, src1, src2, src3, src4;
569    v16u8 tmp0, tmp1, tmp2, tmp3;
570    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
571    v8i16 res;
572    v16i8 mask;
573    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
574    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
575    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
576    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
577    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
578
579    mask = LD_SB(&chroma_mask_arr[48]);
580
581    LD_UB5(src, stride, src0, src1, src2, src3, src4);
582
583    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
584    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
585    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
586    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
587    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
588
589    res_vt0 += res_vt1;
590    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
591    res_vt0 = __msa_sat_u_h(res_vt0, 7);
592
593    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
594
595    ST_H4(res, 0, 1, 2, 3, dst, stride);
596}
597
598static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
599                                 uint32_t coef_hor0, uint32_t coef_hor1,
600                                 uint32_t coef_ver0, uint32_t coef_ver1,
601                                 int32_t height)
602{
603    if (2 == height) {
604        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
605                              coef_ver1);
606    } else if (4 == height) {
607        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
608                              coef_ver1);
609    }
610}
611
612static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
613                                  uint32_t coef_hor0, uint32_t coef_hor1,
614                                  uint32_t coef_ver0, uint32_t coef_ver1)
615{
616    v16u8 src0, src1, src2;
617    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
618    v16i8 mask;
619    v4i32 res;
620    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
621    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
622    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
623    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
624    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
625
626    mask = LD_SB(&chroma_mask_arr[0]);
627    LD_UB3(src, stride, src0, src1, src2);
628    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
629    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
630    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
631
632    res_vt0 += res_vt1;
633    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
634    res_vt0 = __msa_sat_u_h(res_vt0, 7);
635    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
636
637    ST_W2(res, 0, 1, dst, stride);
638}
639
640static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
641                                  uint32_t coef_hor0, uint32_t coef_hor1,
642                                  uint32_t coef_ver0, uint32_t coef_ver1)
643{
644    v16u8 src0, src1, src2, src3, src4;
645    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
647    v16i8 mask;
648    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
649    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
650    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
651    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
652    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
653    v4i32 res0, res1;
654
655    mask = LD_SB(&chroma_mask_arr[0]);
656
657    LD_UB5(src, stride, src0, src1, src2, src3, src4);
658    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
659    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
660    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
661                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
662                res_hz3);
663    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
664         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
665    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
666    SRARI_H2_UH(res_vt0, res_vt1, 6);
667    SAT_UH2_UH(res_vt0, res_vt1, 7);
668    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
669    ST_W2(res0, 0, 1, dst, stride);
670    ST_W2(res1, 0, 1, dst + 2 * stride, stride);
671}
672
673static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
674                                  uint32_t coef_hor0, uint32_t coef_hor1,
675                                  uint32_t coef_ver0, uint32_t coef_ver1)
676{
677    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
678    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
679    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
680    v16i8 mask;
681    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
682    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
683    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
684    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
685    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
686
687    mask = LD_SB(&chroma_mask_arr[0]);
688
689    LD_UB5(src, stride, src0, src1, src2, src3, src4);
690    src += (5 * stride);
691    LD_UB4(src, stride, src5, src6, src7, src8);
692
693    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
694    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
695    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
696    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
697    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
698                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
699    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
700                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
701    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
702         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
703    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
704         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
705    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
706    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
707    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
708    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
709    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
710    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
711}
712
713static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
714                                 uint32_t coef_hor0, uint32_t coef_hor1,
715                                 uint32_t coef_ver0, uint32_t coef_ver1,
716                                 int32_t height)
717{
718    if (2 == height) {
719        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
720                              coef_ver1);
721    } else if (4 == height) {
722        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
723                              coef_ver1);
724    } else if (8 == height) {
725        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
726                              coef_ver1);
727    }
728}
729
730static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
731                                  uint32_t coef_hor0, uint32_t coef_hor1,
732                                  uint32_t coef_ver0, uint32_t coef_ver1)
733{
734    v16u8 src0, src1, src2, src3, src4, out0, out1;
735    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
736    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
737    v16i8 mask;
738    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
739    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
740    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
741    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
742    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
743
744    mask = LD_SB(&chroma_mask_arr[32]);
745
746    src0 = LD_UB(src);
747    src += stride;
748
749    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
750    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
751
752    LD_UB4(src, stride, src1, src2, src3, src4);
753    src += (4 * stride);
754
755    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
756    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
757    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
758                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
759    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
760         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
761
762    res_vt0 += (res_hz0 * coeff_vt_vec1);
763    res_vt1 += (res_hz1 * coeff_vt_vec1);
764    res_vt2 += (res_hz2 * coeff_vt_vec1);
765    res_vt3 += (res_hz3 * coeff_vt_vec1);
766
767    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
768    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
769    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
770    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
771}
772
773static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
774                                  uint32_t coef_hor0, uint32_t coef_hor1,
775                                  uint32_t coef_ver0, uint32_t coef_ver1)
776{
777    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
778    v16u8 out0, out1, out2, out3;
779    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
780    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
781    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
782    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
783    v16i8 mask;
784    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
785    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
786    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
787    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
788    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
789
790    mask = LD_SB(&chroma_mask_arr[32]);
791
792    LD_UB5(src, stride, src0, src1, src2, src3, src4);
793    src += (5 * stride);
794    LD_UB4(src, stride, src5, src6, src7, src8);
795    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
796    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
797    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
798    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
799    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
800    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
801    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
802                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
803                res_hz4);
804    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
805                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
806    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
807         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
808         res_vt3);
809    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
810         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
811         res_vt7);
812    res_vt0 += (res_hz0 * coeff_vt_vec1);
813    res_vt1 += (res_hz1 * coeff_vt_vec1);
814    res_vt2 += (res_hz2 * coeff_vt_vec1);
815    res_vt3 += (res_hz3 * coeff_vt_vec1);
816    res_vt4 += (res_hz4 * coeff_vt_vec1);
817    res_vt5 += (res_hz5 * coeff_vt_vec1);
818    res_vt6 += (res_hz6 * coeff_vt_vec1);
819    res_vt7 += (res_hz7 * coeff_vt_vec1);
820    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
821    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
822    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
823    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
824    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
825    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
826    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
827}
828
829static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
830                                 uint32_t coef_hor0, uint32_t coef_hor1,
831                                 uint32_t coef_ver0, uint32_t coef_ver1,
832                                 int32_t height)
833{
834    if (4 == height) {
835        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
836                              coef_ver1);
837    } else if (8 == height) {
838        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
839                              coef_ver1);
840    }
841}
842
843static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
844                                               int32_t stride, uint32_t coeff0,
845                                               uint32_t coeff1)
846{
847    uint16_t out0, out1;
848    v16i8 src0, src1;
849    v16u8 dst_data = { 0 };
850    v8u16 res_r;
851    v16u8 res;
852    v16i8 mask;
853    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
854    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
855    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
856
857    mask = LD_SB(&chroma_mask_arr[0]);
858
859    LD_SB2(src, stride, src0, src1);
860
861    out0 = LH(dst);
862    out1 = LH(dst + stride);
863
864    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
865    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
866
867    src0 = __msa_vshf_b(mask, src1, src0);
868
869    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
870    res_r <<= 3;
871    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
872    res_r = __msa_sat_u_h(res_r, 7);
873
874    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
875    dst_data = __msa_aver_u_b(res, dst_data);
876
877    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
878    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
879
880    SH(out0, dst);
881    dst += stride;
882    SH(out1, dst);
883}
884
885static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
886                                               int32_t stride, uint32_t coeff0,
887                                               uint32_t coeff1)
888{
889    uint16_t tp0, tp1, tp2, tp3;
890    v16u8 src0, src1, src2, src3;
891    v16u8 dst0, dst_data = { 0 };
892    v8u16 res_r;
893    v16i8 mask;
894    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
895    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
896    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
897
898    mask = LD_SB(&chroma_mask_arr[64]);
899
900    LD_UB4(src, stride, src0, src1, src2, src3);
901    tp0 = LH(dst);
902    tp1 = LH(dst + stride);
903    tp2 = LH(dst + 2 * stride);
904    tp3 = LH(dst + 3 * stride);
905    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
906    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
907    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
908    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
909
910    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
911
912    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
913
914    res_r = __msa_dotp_u_h(src0, coeff_vec);
915    res_r <<= 3;
916    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
917    res_r = __msa_sat_u_h(res_r, 7);
918
919    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
920    dst0 = __msa_aver_u_b(dst0, dst_data);
921
922    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
923}
924
925static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
926                                              int32_t stride, uint32_t coeff0,
927                                              uint32_t coeff1, int32_t height)
928{
929    if (2 == height) {
930        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
931    } else if (4 == height) {
932        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
933    }
934}
935
936static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
937                                               int32_t stride, uint32_t coeff0,
938                                               uint32_t coeff1)
939{
940    uint32_t load0, load1;
941    v16i8 src0, src1;
942    v16u8 dst_data = { 0 };
943    v8u16 res_r;
944    v16i8 res, mask;
945    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
946    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
947    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
948
949    mask = LD_SB(&chroma_mask_arr[0]);
950
951    LD_SB2(src, stride, src0, src1);
952
953    LW2(dst, stride, load0, load1);
954
955    INSERT_W2_UB(load0, load1, dst_data);
956
957    src0 = __msa_vshf_b(mask, src1, src0);
958
959    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
960    res_r <<= 3;
961    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
962    res_r = __msa_sat_u_h(res_r, 7);
963    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
964    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
965
966    ST_W2(dst_data, 0, 1, dst, stride);
967}
968
969static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
970                                               int32_t stride, uint32_t coeff0,
971                                               uint32_t coeff1)
972{
973    uint32_t tp0, tp1, tp2, tp3;
974    v16u8 src0, src1, src2, src3;
975    v16u8 out, dst_data = { 0 };
976    v16i8 mask;
977    v8u16 res0_r, res1_r;
978    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
979    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
980    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
981
982    mask = LD_SB(&chroma_mask_arr[0]);
983
984    LD_UB4(src, stride, src0, src1, src2, src3);
985    LW4(dst, stride, tp0, tp1, tp2, tp3);
986    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
987    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
988    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
989    res0_r <<= 3;
990    res1_r <<= 3;
991    SRARI_H2_UH(res0_r, res1_r, 6);
992    SAT_UH2_UH(res0_r, res1_r, 7);
993    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
994    out = __msa_aver_u_b(out, dst_data);
995    ST_W4(out, 0, 1, 2, 3, dst, stride);
996}
997
998static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
999                                               int32_t stride, uint32_t coeff0,
1000                                               uint32_t coeff1)
1001{
1002    uint32_t tp0, tp1, tp2, tp3;
1003    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
1004    v16u8 dst0 = { 0 }, dst1 = { 0 };
1005    v16i8 mask;
1006    v8u16 res0, res1, res2, res3;
1007    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1008    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1009    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1010
1011    mask = LD_SB(&chroma_mask_arr[0]);
1012
1013    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1014    LW4(dst, stride, tp0, tp1, tp2, tp3);
1015    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1016    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1017    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1018    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
1019    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
1020    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
1021    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
1022    SLLI_4V(res0, res1, res2, res3, 3);
1023    SRARI_H4_UH(res0, res1, res2, res3, 6);
1024    SAT_UH4_UH(res0, res1, res2, res3, 7);
1025    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1026    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1027    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1028}
1029
1030static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1031                                              int32_t stride, uint32_t coeff0,
1032                                              uint32_t coeff1, int32_t height)
1033{
1034    if (2 == height) {
1035        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
1036    } else if (4 == height) {
1037        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
1038    } else if (8 == height) {
1039        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
1040    }
1041}
1042
1043static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1044                                               int32_t stride, uint32_t coeff0,
1045                                               uint32_t coeff1)
1046{
1047    uint64_t tp0, tp1, tp2, tp3;
1048    v16u8 src0, src1, src2, src3, out0, out1;
1049    v16u8 dst0 = { 0 }, dst1 = { 0 };
1050    v8u16 res0, res1, res2, res3;
1051    v16i8 mask;
1052    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1053    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1054    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1055
1056    mask = LD_SB(&chroma_mask_arr[32]);
1057    LD_UB4(src, stride, src0, src1, src2, src3);
1058    LD4(dst, stride, tp0, tp1, tp2, tp3);
1059    INSERT_D2_UB(tp0, tp1, dst0);
1060    INSERT_D2_UB(tp2, tp3, dst1);
1061    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1062    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1063    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1064                coeff_vec, res0, res1, res2, res3);
1065    SLLI_4V(res0, res1, res2, res3, 3);
1066    SRARI_H4_UH(res0, res1, res2, res3, 6);
1067    SAT_UH4_UH(res0, res1, res2, res3, 7);
1068    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1069    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
1070    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1071}
1072
1073static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1074                                               int32_t stride, uint32_t coeff0,
1075                                               uint32_t coeff1)
1076{
1077    uint64_t tp0, tp1, tp2, tp3;
1078    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1079    v16u8 out0, out1, out2, out3;
1080    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1081    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1082    v16i8 mask;
1083    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1084    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1085    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1086
1087    mask = LD_SB(&chroma_mask_arr[32]);
1088
1089    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1090    LD4(dst, stride, tp0, tp1, tp2, tp3);
1091    INSERT_D2_UB(tp0, tp1, dst0);
1092    INSERT_D2_UB(tp2, tp3, dst1);
1093    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1094    INSERT_D2_UB(tp0, tp1, dst2);
1095    INSERT_D2_UB(tp2, tp3, dst3);
1096    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1097    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1098    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
1099    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
1100    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1101                coeff_vec, res0, res1, res2, res3);
1102    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1103                coeff_vec, res4, res5, res6, res7);
1104    SLLI_4V(res0, res1, res2, res3, 3);
1105    SLLI_4V(res4, res5, res6, res7, 3);
1106    SRARI_H4_UH(res0, res1, res2, res3, 6);
1107    SRARI_H4_UH(res4, res5, res6, res7, 6);
1108    SAT_UH4_UH(res0, res1, res2, res3, 7);
1109    SAT_UH4_UH(res4, res5, res6, res7, 7);
1110    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1111    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1112    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1113    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1114    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1115}
1116
1117static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1118                                              int32_t stride, uint32_t coeff0,
1119                                              uint32_t coeff1, int32_t height)
1120{
1121    if (4 == height) {
1122        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
1123    } else if (8 == height) {
1124        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
1125    }
1126}
1127
1128static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
1129                                               int32_t stride, uint32_t coeff0,
1130                                               uint32_t coeff1)
1131{
1132    uint16_t out0, out1;
1133    v16i8 src0, src1, src2, tmp0, tmp1, res;
1134    v16u8 dst_data = { 0 };
1135    v8i16 out;
1136    v8u16 res_r;
1137    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1138    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1139    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1140
1141    LD_SB3(src, stride, src0, src1, src2);
1142    out0 = LH(dst);
1143    out1 = LH(dst + stride);
1144
1145    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
1146    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
1147
1148    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
1149
1150    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1151    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
1152    res_r <<= 3;
1153    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1154    res_r = __msa_sat_u_h(res_r, 7);
1155    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1156    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1157    out0 = __msa_copy_u_h(out, 0);
1158    out1 = __msa_copy_u_h(out, 2);
1159
1160    SH(out0, dst);
1161    dst += stride;
1162    SH(out1, dst);
1163}
1164
1165static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
1166                                               int32_t stride, uint32_t coeff0,
1167                                               uint32_t coeff1)
1168{
1169    uint16_t tp0, tp1, tp2, tp3;
1170    v16i8 src0, src1, src2, src3, src4;
1171    v16u8 tmp0, tmp1, tmp2, tmp3;
1172    v8u16 res_r;
1173    v8i16 res;
1174    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1175    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1176    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1177    v16u8 dst_data = { 0 };
1178
1179    LD_SB5(src, stride, src0, src1, src2, src3, src4);
1180
1181    tp0 = LH(dst);
1182    tp1 = LH(dst + stride);
1183    tp2 = LH(dst + 2 * stride);
1184    tp3 = LH(dst + 3 * stride);
1185    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
1186    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
1187    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
1188    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
1189
1190    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1191               tmp0, tmp1, tmp2, tmp3);
1192    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1193
1194    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1195
1196    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1197    res_r <<= 3;
1198    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1199    res_r = __msa_sat_u_h(res_r, 7);
1200
1201    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1202    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1203
1204    ST_H4(res, 0, 1, 2, 3, dst, stride);
1205}
1206
1207static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
1208                                              int32_t stride, uint32_t coeff0,
1209                                              uint32_t coeff1, int32_t height)
1210{
1211    if (2 == height) {
1212        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
1213    } else if (4 == height) {
1214        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
1215    }
1216}
1217
1218static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
1219                                               int32_t stride, uint32_t coeff0,
1220                                               uint32_t coeff1)
1221{
1222    uint32_t load0, load1;
1223    v16u8 src0, src1, src2, tmp0, tmp1;
1224    v16u8 dst_data = { 0 };
1225    v8u16 res_r;
1226    v16u8 res;
1227    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1228    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1229    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1230
1231    LD_UB3(src, stride, src0, src1, src2);
1232
1233    LW2(dst, stride, load0, load1);
1234
1235    INSERT_W2_UB(load0, load1, dst_data);
1236    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
1237
1238    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1239
1240    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1241    res_r <<= 3;
1242    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1243    res_r = __msa_sat_u_h(res_r, 7);
1244    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1245    res = __msa_aver_u_b(res, dst_data);
1246
1247    ST_W2(res, 0, 1, dst, stride);
1248}
1249
1250static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
1251                                               int32_t stride, uint32_t coeff0,
1252                                               uint32_t coeff1)
1253{
1254    uint32_t tp0, tp1, tp2, tp3;
1255    v16u8 src0, src1, src2, src3, src4;
1256    v16u8 tmp0, tmp1, tmp2, tmp3;
1257    v16u8 dst0 = { 0 };
1258    v8u16 res0_r, res1_r;
1259    v16u8 out;
1260    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1261    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1262    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1263
1264    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1265    LW4(dst, stride, tp0, tp1, tp2, tp3);
1266    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1267    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
1268               tmp3);
1269    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1270    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
1271    res0_r <<= 3;
1272    res1_r <<= 3;
1273    SRARI_H2_UH(res0_r, res1_r, 6);
1274    SAT_UH2_UH(res0_r, res1_r, 7);
1275    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
1276    out = __msa_aver_u_b(out, dst0);
1277    ST_W4(out, 0, 1, 2, 3, dst, stride);
1278}
1279
1280static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
1281                                               int32_t stride, uint32_t coeff0,
1282                                               uint32_t coeff1)
1283{
1284    uint32_t tp0, tp1, tp2, tp3;
1285    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1286    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
1287    v16u8 dst0 = { 0 }, dst1 = { 0 };
1288    v8u16 res0, res1, res2, res3;
1289    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1290    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1291    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1292
1293    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1294    src += (5 * stride);
1295    LD_UB4(src, stride, src5, src6, src7, src8);
1296    LW4(dst, stride, tp0, tp1, tp2, tp3);
1297    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1298    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1299    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1300    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
1301               tmp3);
1302    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
1303               tmp7);
1304    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1305    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
1306    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
1307    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
1308    SLLI_4V(res0, res1, res2, res3, 3);
1309    SRARI_H4_UH(res0, res1, res2, res3, 6);
1310    SAT_UH4_UH(res0, res1, res2, res3, 7);
1311    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1312    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1313    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1314}
1315
1316static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1317                                              int32_t stride, uint32_t coeff0,
1318                                              uint32_t coeff1, int32_t height)
1319{
1320    if (2 == height) {
1321        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
1322    } else if (4 == height) {
1323        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
1324    } else if (8 == height) {
1325        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
1326    }
1327}
1328
1329static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1330                                               int32_t stride, uint32_t coeff0,
1331                                               uint32_t coeff1)
1332{
1333    uint64_t tp0, tp1, tp2, tp3;
1334    v16u8 src0, src1, src2, src3, src4;
1335    v16u8 out0, out1;
1336    v8u16 res0, res1, res2, res3;
1337    v16u8 dst0 = { 0 }, dst1 = { 0 };
1338    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1339    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1340    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1341
1342    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1343    LD4(dst, stride, tp0, tp1, tp2, tp3);
1344    INSERT_D2_UB(tp0, tp1, dst0);
1345    INSERT_D2_UB(tp2, tp3, dst1);
1346    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1347               src0, src1, src2, src3);
1348    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1349                coeff_vec, res0, res1, res2, res3);
1350    SLLI_4V(res0, res1, res2, res3, 3);
1351    SRARI_H4_UH(res0, res1, res2, res3, 6);
1352    SAT_UH4_UH(res0, res1, res2, res3, 7);
1353    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1354    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1355    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1356}
1357
1358static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1359                                               int32_t stride, uint32_t coeff0,
1360                                               uint32_t coeff1)
1361{
1362    uint64_t tp0, tp1, tp2, tp3;
1363    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1364    v16u8 out0, out1, out2, out3;
1365    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1366    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1367    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1368    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1369    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1370
1371    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1372    src += (5 * stride);
1373    LD_UB4(src, stride, src5, src6, src7, src8);
1374    LD4(dst, stride, tp0, tp1, tp2, tp3);
1375    INSERT_D2_UB(tp0, tp1, dst0);
1376    INSERT_D2_UB(tp2, tp3, dst1);
1377    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1378    INSERT_D2_UB(tp0, tp1, dst2);
1379    INSERT_D2_UB(tp2, tp3, dst3);
1380    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1381               src0, src1, src2, src3);
1382    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1383               src4, src5, src6, src7);
1384    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1385                coeff_vec, res0, res1, res2, res3);
1386    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1387                coeff_vec, res4, res5, res6, res7);
1388    SLLI_4V(res0, res1, res2, res3, 3);
1389    SLLI_4V(res4, res5, res6, res7, 3);
1390    SRARI_H4_UH(res0, res1, res2, res3, 6);
1391    SRARI_H4_UH(res4, res5, res6, res7, 6);
1392    SAT_UH4_UH(res0, res1, res2, res3, 7);
1393    SAT_UH4_UH(res0, res1, res2, res3, 7);
1394    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1395    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1396    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1397    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1398    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1399}
1400
1401static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1402                                              int32_t stride, uint32_t coeff0,
1403                                              uint32_t coeff1, int32_t height)
1404{
1405    if (4 == height) {
1406        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
1407    } else if (8 == height) {
1408        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
1409    }
1410}
1411
1412static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
1413                                               int32_t stride,
1414                                               uint32_t coef_hor0,
1415                                               uint32_t coef_hor1,
1416                                               uint32_t coef_ver0,
1417                                               uint32_t coef_ver1)
1418{
1419    uint16_t out0, out1;
1420    v16u8 dst0 = { 0 };
1421    v16u8 src0, src1, src2;
1422    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1423    v16i8 res, mask;
1424    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1425    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1426    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1427    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1428    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1429
1430    mask = LD_SB(&chroma_mask_arr[48]);
1431
1432    LD_UB3(src, stride, src0, src1, src2);
1433    out0 = LH(dst);
1434    out1 = LH(dst + stride);
1435    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
1436    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
1437    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1438    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1439    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1440
1441    res_vt0 += res_vt1;
1442    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1443    res_vt0 = __msa_sat_u_h(res_vt0, 7);
1444    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1445    dst0 = __msa_aver_u_b((v16u8) res, dst0);
1446    out0 = __msa_copy_u_h((v8i16) dst0, 0);
1447    out1 = __msa_copy_u_h((v8i16) dst0, 1);
1448
1449    SH(out0, dst);
1450    dst += stride;
1451    SH(out1, dst);
1452}
1453
1454static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
1455                                               int32_t stride,
1456                                               uint32_t coef_hor0,
1457                                               uint32_t coef_hor1,
1458                                               uint32_t coef_ver0,
1459                                               uint32_t coef_ver1)
1460{
1461    uint16_t tp0, tp1, tp2, tp3;
1462    v16u8 src0, src1, src2, src3, src4;
1463    v16u8 tmp0, tmp1, tmp2, tmp3;
1464    v16u8 dst0 = { 0 };
1465    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1466    v16i8 res, mask;
1467    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1468    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1469    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1470    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1471    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1472
1473    mask = LD_SB(&chroma_mask_arr[48]);
1474
1475    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1476    tp0 = LH(dst);
1477    tp1 = LH(dst + stride);
1478    tp2 = LH(dst + 2 * stride);
1479    tp3 = LH(dst + 3 * stride);
1480    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
1481    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
1482    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
1483    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
1484    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
1485    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
1486    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1487    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1488    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1489
1490    res_vt0 += res_vt1;
1491    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1492    res_vt0 = __msa_sat_u_h(res_vt0, 7);
1493    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1494    dst0 = __msa_aver_u_b((v16u8) res, dst0);
1495
1496    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
1497}
1498
1499static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
1500                                              int32_t stride,
1501                                              uint32_t coef_hor0,
1502                                              uint32_t coef_hor1,
1503                                              uint32_t coef_ver0,
1504                                              uint32_t coef_ver1,
1505                                              int32_t height)
1506{
1507    if (2 == height) {
1508        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
1509                                           coef_hor1, coef_ver0, coef_ver1);
1510    } else if (4 == height) {
1511        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
1512                                           coef_hor1, coef_ver0, coef_ver1);
1513    }
1514}
1515
1516static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
1517                                               int32_t stride,
1518                                               uint32_t coef_hor0,
1519                                               uint32_t coef_hor1,
1520                                               uint32_t coef_ver0,
1521                                               uint32_t coef_ver1)
1522{
1523    uint32_t tp0, tp1;
1524    v16u8 src0, src1, src2;
1525    v16u8 dst0, dst_data = { 0 };
1526    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1527    v16i8 mask;
1528    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1529    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1530    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1531    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1532    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1533
1534    mask = LD_SB(&chroma_mask_arr[0]);
1535
1536    LD_UB3(src, stride, src0, src1, src2);
1537    LW2(dst, stride, tp0, tp1);
1538    INSERT_W2_UB(tp0, tp1, dst_data);
1539    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1540    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1541    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1542
1543    res_vt0 += res_vt1;
1544    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1545    res_vt0 = __msa_sat_u_h(res_vt0, 7);
1546    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1547    dst0 = __msa_aver_u_b(dst0, dst_data);
1548
1549    ST_W2(dst0, 0, 1, dst, stride);
1550}
1551
1552static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
1553                                               int32_t stride,
1554                                               uint32_t coef_hor0,
1555                                               uint32_t coef_hor1,
1556                                               uint32_t coef_ver0,
1557                                               uint32_t coef_ver1)
1558{
1559    uint32_t tp0, tp1, tp2, tp3;
1560    v16u8 src0, src1, src2, src3, src4;
1561    v16u8 out, dst_data = { 0 };
1562    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
1563    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1564    v16i8 mask;
1565    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1566    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1567    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1568    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1569    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1570
1571    mask = LD_SB(&chroma_mask_arr[0]);
1572
1573    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1574    LW4(dst, stride, tp0, tp1, tp2, tp3);
1575    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
1576    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1577    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
1578    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
1579                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
1580                res_hz3);
1581    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1582         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1583    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1584    SRARI_H2_UH(res_vt0, res_vt1, 6);
1585    SAT_UH2_UH(res_vt0, res_vt1, 7);
1586    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
1587    out = __msa_aver_u_b(out, dst_data);
1588    ST_W4(out, 0, 1, 2, 3, dst, stride);
1589}
1590
1591static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
1592                                               int32_t stride,
1593                                               uint32_t coef_hor0,
1594                                               uint32_t coef_hor1,
1595                                               uint32_t coef_ver0,
1596                                               uint32_t coef_ver1)
1597{
1598    uint32_t tp0, tp1, tp2, tp3;
1599    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1600    v16u8 dst0 = { 0 }, dst1 = { 0 };
1601    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
1602    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
1603    v16i8 mask;
1604    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1605    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1606    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1607    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1608    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1609
1610    mask = LD_SB(&chroma_mask_arr[0]);
1611
1612    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1613    src += (5 * stride);
1614    LD_UB4(src, stride, src5, src6, src7, src8);
1615    LW4(dst, stride, tp0, tp1, tp2, tp3);
1616    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1617    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1618    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1619    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1620    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
1621    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
1622    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
1623    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
1624                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
1625    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
1626                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
1627    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1628         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1629    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
1630         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
1631    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1632    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
1633    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1634    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1635    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
1636    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
1637    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1638}
1639
1640static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1641                                              int32_t stride,
1642                                              uint32_t coef_hor0,
1643                                              uint32_t coef_hor1,
1644                                              uint32_t coef_ver0,
1645                                              uint32_t coef_ver1,
1646                                              int32_t height)
1647{
1648    if (2 == height) {
1649        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
1650                                           coef_hor1, coef_ver0, coef_ver1);
1651    } else if (4 == height) {
1652        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
1653                                           coef_hor1, coef_ver0, coef_ver1);
1654    } else if (8 == height) {
1655        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
1656                                           coef_hor1, coef_ver0, coef_ver1);
1657    }
1658}
1659
1660static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1661                                               int32_t stride,
1662                                               uint32_t coef_hor0,
1663                                               uint32_t coef_hor1,
1664                                               uint32_t coef_ver0,
1665                                               uint32_t coef_ver1)
1666{
1667    uint64_t tp0, tp1, tp2, tp3;
1668    v16u8 src0, src1, src2, src3, src4, out0, out1;
1669    v8u16 res_hz0, res_hz1, res_hz2;
1670    v8u16 res_hz3, res_hz4;
1671    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1672    v16u8 dst0 = { 0 }, dst1 = { 0 };
1673    v16i8 mask;
1674    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1675    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1676    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1677    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1678    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1679
1680    mask = LD_SB(&chroma_mask_arr[32]);
1681
1682    src0 = LD_UB(src);
1683    src += stride;
1684    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1685    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1686    LD_UB4(src, stride, src1, src2, src3, src4);
1687    src += (4 * stride);
1688    LD4(dst, stride, tp0, tp1, tp2, tp3);
1689    INSERT_D2_UB(tp0, tp1, dst0);
1690    INSERT_D2_UB(tp2, tp3, dst1);
1691    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1692    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1693    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1694                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
1695    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
1696         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1697    res_vt0 += (res_hz0 * coeff_vt_vec1);
1698    res_vt1 += (res_hz1 * coeff_vt_vec1);
1699    res_vt2 += (res_hz2 * coeff_vt_vec1);
1700    res_vt3 += (res_hz3 * coeff_vt_vec1);
1701    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1702    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1703    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1704    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1705    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1706}
1707
1708static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1709                                               int32_t stride,
1710                                               uint32_t coef_hor0,
1711                                               uint32_t coef_hor1,
1712                                               uint32_t coef_ver0,
1713                                               uint32_t coef_ver1)
1714{
1715    uint64_t tp0, tp1, tp2, tp3;
1716    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1717    v16u8 out0, out1, out2, out3;
1718    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1719    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
1720    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
1721    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1722    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
1723    v16i8 mask;
1724    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1725    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1726    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1727    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1728    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1729
1730    mask = LD_SB(&chroma_mask_arr[32]);
1731
1732    LD_UB5(src, stride, src0, src1, src2, src3, src4);
1733    src += (5 * stride);
1734    LD_UB4(src, stride, src5, src6, src7, src8);
1735    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1736    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1737    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1738    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
1739    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
1740    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1741    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1742                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
1743                res_hz4);
1744    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
1745                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
1746    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
1747         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
1748         res_vt3);
1749    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
1750         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
1751         res_vt7);
1752    LD4(dst, stride, tp0, tp1, tp2, tp3);
1753    INSERT_D2_UB(tp0, tp1, dst0);
1754    INSERT_D2_UB(tp2, tp3, dst1);
1755    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1756    INSERT_D2_UB(tp0, tp1, dst2);
1757    INSERT_D2_UB(tp2, tp3, dst3);
1758    res_vt0 += (res_hz0 * coeff_vt_vec1);
1759    res_vt1 += (res_hz1 * coeff_vt_vec1);
1760    res_vt2 += (res_hz2 * coeff_vt_vec1);
1761    res_vt3 += (res_hz3 * coeff_vt_vec1);
1762    res_vt4 += (res_hz4 * coeff_vt_vec1);
1763    res_vt5 += (res_hz5 * coeff_vt_vec1);
1764    res_vt6 += (res_hz6 * coeff_vt_vec1);
1765    res_vt7 += (res_hz7 * coeff_vt_vec1);
1766    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1767    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
1768    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1769    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
1770    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1771    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
1772    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1773    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1774    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1775}
1776
1777static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1778                                              int32_t stride,
1779                                              uint32_t coef_hor0,
1780                                              uint32_t coef_hor1,
1781                                              uint32_t coef_ver0,
1782                                              uint32_t coef_ver1,
1783                                              int32_t height)
1784{
1785    if (4 == height) {
1786        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
1787                                           coef_hor1, coef_ver0, coef_ver1);
1788    } else if (8 == height) {
1789        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
1790                                           coef_hor1, coef_ver0, coef_ver1);
1791    }
1792}
1793
1794static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1795                            int32_t height)
1796{
1797    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1798
1799    if (8 == height) {
1800        LW4(src, stride, tp0, tp1, tp2, tp3);
1801        src += 4 * stride;
1802        LW4(src, stride, tp4, tp5, tp6, tp7);
1803        SW4(tp0, tp1, tp2, tp3, dst, stride);
1804        dst += 4 * stride;
1805        SW4(tp4, tp5, tp6, tp7, dst, stride);
1806    } else if (4 == height) {
1807        LW4(src, stride, tp0, tp1, tp2, tp3);
1808        SW4(tp0, tp1, tp2, tp3, dst, stride);
1809    } else if (2 == height) {
1810        LW2(src, stride, tp0, tp1);
1811        SW(tp0, dst);
1812        dst += stride;
1813        SW(tp1, dst);
1814    }
1815}
1816
1817static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1818                            int32_t height)
1819{
1820    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
1821
1822    if (8 == height) {
1823        LD4(src, stride, src0, src1, src2, src3);
1824        src += 4 * stride;
1825        LD4(src, stride, src4, src5, src6, src7);
1826        SD4(src0, src1, src2, src3, dst, stride);
1827        dst += 4 * stride;
1828        SD4(src4, src5, src6, src7, dst, stride);
1829    } else if (4 == height) {
1830        LD4(src, stride, src0, src1, src2, src3);
1831        SD4(src0, src1, src2, src3, dst, stride);
1832    }
1833}
1834
1835static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1836                           int32_t height)
1837{
1838    uint32_t tp0, tp1, tp2, tp3;
1839    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
1840
1841    if (8 == height) {
1842        LW4(src, stride, tp0, tp1, tp2, tp3);
1843        src += 4 * stride;
1844        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1845        LW4(src, stride, tp0, tp1, tp2, tp3);
1846        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
1847        LW4(dst, stride, tp0, tp1, tp2, tp3);
1848        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1849        LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1850        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1851        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1852        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1853    } else if (4 == height) {
1854        LW4(src, stride, tp0, tp1, tp2, tp3);
1855        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1856        LW4(dst, stride, tp0, tp1, tp2, tp3);
1857        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1858        dst0 = __msa_aver_u_b(src0, dst0);
1859        ST_W4(dst0, 0, 1, 2, 3, dst, stride);
1860    } else if (2 == height) {
1861        LW2(src, stride, tp0, tp1);
1862        INSERT_W2_UB(tp0, tp1, src0);
1863        LW2(dst, stride, tp0, tp1);
1864        INSERT_W2_UB(tp0, tp1, dst0);
1865        dst0 = __msa_aver_u_b(src0, dst0);
1866        ST_W2(dst0, 0, 1, dst, stride);
1867    }
1868}
1869
1870static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1871                           int32_t height)
1872{
1873    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1874    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
1875    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1876
1877    if (8 == height) {
1878        LD4(src, stride, tp0, tp1, tp2, tp3);
1879        src += 4 * stride;
1880        LD4(src, stride, tp4, tp5, tp6, tp7);
1881        INSERT_D2_UB(tp0, tp1, src0);
1882        INSERT_D2_UB(tp2, tp3, src1);
1883        INSERT_D2_UB(tp4, tp5, src2);
1884        INSERT_D2_UB(tp6, tp7, src3);
1885        LD4(dst, stride, tp0, tp1, tp2, tp3);
1886        LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
1887        INSERT_D2_UB(tp0, tp1, dst0);
1888        INSERT_D2_UB(tp2, tp3, dst1);
1889        INSERT_D2_UB(tp4, tp5, dst2);
1890        INSERT_D2_UB(tp6, tp7, dst3);
1891        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1892                    dst2, dst3);
1893        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1894    } else if (4 == height) {
1895        LD4(src, stride, tp0, tp1, tp2, tp3);
1896        INSERT_D2_UB(tp0, tp1, src0);
1897        INSERT_D2_UB(tp2, tp3, src1);
1898        LD4(dst, stride, tp0, tp1, tp2, tp3);
1899        INSERT_D2_UB(tp0, tp1, dst0);
1900        INSERT_D2_UB(tp2, tp3, dst1);
1901        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1902        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1903    }
1904}
1905
1906void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
1907                                ptrdiff_t stride, int height, int x, int y)
1908{
1909    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1910
1911    if (x && y) {
1912        avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1913    } else if (x) {
1914        avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
1915    } else if (y) {
1916        avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
1917    } else {
1918        copy_width8_msa(src, dst, stride, height);
1919    }
1920}
1921
1922void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
1923                                ptrdiff_t stride, int height, int x, int y)
1924{
1925    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1926
1927    if (x && y) {
1928        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1929    } else if (x) {
1930        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
1931    } else if (y) {
1932        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
1933    } else {
1934        copy_width4_msa(src, dst, stride, height);
1935    }
1936}
1937
1938void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
1939                                ptrdiff_t stride, int height, int x, int y)
1940{
1941    int32_t cnt;
1942
1943    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1944
1945    if (x && y) {
1946        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1947    } else if (x) {
1948        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
1949    } else if (y) {
1950        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
1951    } else {
1952        for (cnt = height; cnt--;) {
1953            *((uint16_t *) dst) = *((uint16_t *) src);
1954
1955            src += stride;
1956            dst += stride;
1957        }
1958    }
1959}
1960
1961void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
1962                                ptrdiff_t stride, int height, int x, int y)
1963{
1964    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1965
1966
1967    if (x && y) {
1968        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
1969                                          (8 - y), height);
1970    } else if (x) {
1971        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
1972    } else if (y) {
1973        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, (8 - y), height);
1974    } else {
1975        avg_width8_msa(src, dst, stride, height);
1976    }
1977}
1978
1979void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
1980                                ptrdiff_t stride, int height, int x, int y)
1981{
1982    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1983
1984    if (x && y) {
1985        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
1986                                          (8 - y), height);
1987    } else if (x) {
1988        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
1989    } else if (y) {
1990        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, (8 - y), height);
1991    } else {
1992        avg_width4_msa(src, dst, stride, height);
1993    }
1994}
1995
1996void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
1997                                ptrdiff_t stride, int height, int x, int y)
1998{
1999    int32_t cnt;
2000
2001    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2002
2003    if (x && y) {
2004        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
2005                                          (8 - y), height);
2006    } else if (x) {
2007        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
2008    } else if (y) {
2009        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, (8 - y), height);
2010    } else {
2011        for (cnt = height; cnt--;) {
2012            dst[0] = (dst[0] + src[0] + 1) >> 1;
2013            dst[1] = (dst[1] + src[1] + 1) >> 1;
2014
2015            src += stride;
2016            dst += stride;
2017        }
2018    }
2019}
2020