/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

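/* Shuffle control masks for the VSHF.B based filtering below.  The
 * 8-width row yields the overlapping byte pairs (0,1), (1,2), ...
 * consumed by the 8-tap dot products; in the 4-width row, indices 16
 * and up select bytes from the second source operand, so two 4-wide
 * rows can be filtered out of a single shuffled vector. */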
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

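/* Uni-weighted prediction core: widen the 16-bit inputs, multiply by
 * the 32-bit weight, apply a rounding right shift, pack back to 16 bits,
 * then add the offset and clip to the 8-bit range.  Per element this is
 * roughly:
 *
 *     out = clip_0_255(((in * wgt + (1 << (rnd - 1))) >> rnd) + offset)
 *
 * with SRAR supplying the rounding shift and ADDS/CLIP the saturating
 * tail. */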
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
                                       out0_h, out1_h)                        \
{                                                                             \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \
                                                                              \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                              \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                              \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,      \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                   \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
    CLIP_SH2_0_255(out0_h, out1_h);                                           \
}

#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
                                       offset_h, rnd_w, out0_h, out1_h,    \
                                       out2_h, out3_h)                     \
{                                                                          \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
                                   out0_h, out1_h);                        \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
                                   out2_h, out3_h);                        \
}

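/* Plain-C reference for the hevc_uniwgt_copy_* family below; an
 * illustrative sketch only (never called), using av_clip_uint8() and a
 * rounding right shift as in SRAR:
 *
 *     for (y = 0; y < height; y++)
 *         for (x = 0; x < width; x++)
 *             dst[y * dst_stride + x] =
 *                 av_clip_uint8(((((src[y * src_stride + x] << 6) * weight) +
 *                                 (1 << (rnd_val - 1))) >> rnd_val) + offset);
 *
 * The << 6 recreates the 14-bit intermediate that HEVC uni-directional
 * weighted prediction expects for 8-bit content. */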
static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        dst0 += offset_vec;
        CLIP_SH_0_255(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += 8 * dst_stride;
        }
    }
}

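/* 6-wide rows are computed on full 8-wide vectors; each row is then
 * stored as one 4-byte word plus one 2-byte halfword (ST_W2 / ST_H2),
 * so nothing beyond the 6 valid pixels is written. */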
static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);

            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
                  dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

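/* 12-wide rows are split into an 8-wide left part and a 4-wide
 * remainder; ILVL_W2 gathers the remainders of four rows so they can
 * share one weighting pass.  The fixed four-iteration loop (4 x 4 rows)
 * matches the 16-row blocks this width is used with. */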
static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 offset_vec;
    v16i8 zero = { 0 };
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        ILVRL_B2_SH(zero, src6, dst12, dst13);
        ILVRL_B2_SH(zero, src7, dst14, dst15);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
                                       offset_vec, rnd_vec, dst12, dst13, dst14,
                                       dst15);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
        ST_UB4(out4, out5, out6, out7, dst, 16);
        dst += dst_stride;
    }
}

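/* Horizontal 8-tap weighted filters.  Source bytes are XORed with 128
 * (XORI_B*_128) so that signed-byte dot products can be used; as the
 * HEVC luma filter taps sum to 64, this biases every filtered sample by
 * -128 * 64.  The setup code folds the compensation into the offset:
 *
 *     offset += (128 * weight) >> (rnd_val - 6)
 *
 * which equals (128 * 64 * weight) >> rnd_val, i.e. the bias after
 * weighting and the rounding shift (up to rounding of the correction
 * term itself). */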
static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);
        dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                  filt3);
        dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                  filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

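/* In the 24-wide case each row is loaded as two 16-byte vectors, and
 * mask4..mask7 (mask0 + 8 ...) contain indices past 15; the two-operand
 * VSHF forms can therefore fetch the filter windows that straddle the
 * src0/src1 boundary directly. */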
914static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
915                                      int32_t src_stride,
916                                      uint8_t *dst,
917                                      int32_t dst_stride,
918                                      const int8_t *filter,
919                                      int32_t height,
920                                      int32_t weight,
921                                      int32_t offset,
922                                      int32_t rnd_val)
923{
924    uint32_t loop_cnt;
925    v16u8 out0, out1, out2;
926    v16i8 src0, src1, src2, src3;
927    v8i16 filt0, filt1, filt2, filt3;
928    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
929    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
930    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
931    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
932    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
933    v4i32 weight_vec, rnd_vec;
934
935    src -= 3;
936
937    weight_vec = __msa_fill_w(weight);
938    rnd_vec = __msa_fill_w(rnd_val);
939
940    weight *= 128;
941    rnd_val -= 6;
942
943    weight_vec_h = __msa_fill_h(weight);
944    offset_vec = __msa_fill_h(offset);
945    denom_vec = __msa_fill_h(rnd_val);
946
947    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
948    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
949
950    filter_vec = LD_SH(filter);
951    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
952
953    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
954    mask1 = mask0 + 2;
955    mask2 = mask0 + 4;
956    mask3 = mask0 + 6;
957    mask4 = mask0 + 8;
958    mask5 = mask0 + 10;
959    mask6 = mask0 + 12;
960    mask7 = mask0 + 14;
961
962    for (loop_cnt = 16; loop_cnt--;) {
963        LD_SB2(src, 16, src0, src1);
964        src += src_stride;
965        LD_SB2(src, 16, src2, src3);
966        src += src_stride;
967        XORI_B4_128_SB(src0, src1, src2, src3);
968        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
969                   vec0, vec1, vec2, vec3);
970        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
971                   vec4, vec5, vec6, vec7);
972        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
973                   vec8, vec9, vec10, vec11);
974        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
975                   vec12, vec13, vec14, vec15);
976        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
977                                 filt3);
978        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
979                                 filt3);
980        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
981                                 filt3);
982        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
983                                 filt2, filt3);
984
985        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
986                   vec0, vec1, vec2, vec3);
987        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
988                   vec4, vec5, vec6, vec7);
989        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
990                                 filt3);
991        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
992                                 filt3);
993
994        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
995                                       offset_vec, rnd_vec, dst0, dst1, dst2,
996                                       dst3);
997        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
998                                       rnd_vec, dst4, dst5);
999
1000        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
1001        ST_UB2(out0, out1, dst, dst_stride);
1002        ST_D2(out2, 0, 1, dst + 16, dst_stride);
1003        dst += (2 * dst_stride);
1004    }
1005}
1006
1007static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
1008                                      int32_t src_stride,
1009                                      uint8_t *dst,
1010                                      int32_t dst_stride,
1011                                      const int8_t *filter,
1012                                      int32_t height,
1013                                      int32_t weight,
1014                                      int32_t offset,
1015                                      int32_t rnd_val)
1016{
1017    uint32_t loop_cnt;
1018    v16u8 out0, out1, out2, out3;
1019    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1020    v8i16 filt0, filt1, filt2, filt3;
1021    v16i8 mask0, mask1, mask2, mask3;
1022    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1023    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1024    v8i16 filter_vec;
1025    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1026    v8i16 weight_vec_h, offset_vec, denom_vec;
1027    v4i32 weight_vec, rnd_vec;
1028
1029    src -= 3;
1030
1031    weight_vec = __msa_fill_w(weight);
1032    rnd_vec = __msa_fill_w(rnd_val);
1033
1034    weight *= 128;
1035    rnd_val -= 6;
1036
1037    weight_vec_h = __msa_fill_h(weight);
1038    offset_vec = __msa_fill_h(offset);
1039    denom_vec = __msa_fill_h(rnd_val);
1040
1041    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1042    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1043
1044    filter_vec = LD_SH(filter);
1045    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1046
1047    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1048    mask1 = mask0 + 2;
1049    mask2 = mask0 + 4;
1050    mask3 = mask0 + 6;
1051
1052    for (loop_cnt = height >> 1; loop_cnt--;) {
1053        LD_SB4(src, 8, src0, src1, src2, src3);
1054        src += src_stride;
1055        LD_SB4(src, 8, src4, src5, src6, src7);
1056        src += src_stride;
1057        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1058
1059        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1060                   vec0, vec1, vec2, vec3);
1061        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1062                   vec4, vec5, vec6, vec7);
1063        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1064                   vec8, vec9, vec10, vec11);
1065        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1066                   vec12, vec13, vec14, vec15);
1067        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1068                                 filt3);
1069        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1070                                 filt3);
1071        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1072                                 filt3);
1073        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1074                                 filt2, filt3);
1075
1076        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1077                   vec0, vec1, vec2, vec3);
1078        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1079                   vec4, vec5, vec6, vec7);
1080        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1081                   vec8, vec9, vec10, vec11);
1082        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1083                   vec12, vec13, vec14, vec15);
1084        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1085                                 filt3);
1086        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1087                                 filt3);
1088        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1089                                 filt3);
1090        dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1091                                 filt2, filt3);
1092
1093        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1094                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1095                                       dst3);
1096        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1097                                       offset_vec, rnd_vec, dst4, dst5, dst6,
1098                                       dst7);
1099
1100        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1101        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1102        ST_UB2(out0, out1, dst, 16);
1103        dst += dst_stride;
1104        ST_UB2(out2, out3, dst, 16);
1105        dst += dst_stride;
1106    }
1107}
1108
1109static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
1110                                      int32_t src_stride,
1111                                      uint8_t *dst,
1112                                      int32_t dst_stride,
1113                                      const int8_t *filter,
1114                                      int32_t height,
1115                                      int32_t weight,
1116                                      int32_t offset,
1117                                      int32_t rnd_val)
1118{
1119    uint32_t loop_cnt;
1120    v16u8 out0, out1, out2;
1121    v16i8 src0, src1, src2, src3;
1122    v8i16 filt0, filt1, filt2, filt3;
1123    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1124    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1125    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1126    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1127    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1128    v4i32 weight_vec, rnd_vec;
1129
1130    src -= 3;
1131
1132    weight = weight & 0x0000FFFF;
1133    weight_vec = __msa_fill_w(weight);
1134    rnd_vec = __msa_fill_w(rnd_val);
1135
1136    weight *= 128;
1137    rnd_val -= 6;
1138
1139    weight_vec_h = __msa_fill_h(weight);
1140    offset_vec = __msa_fill_h(offset);
1141    denom_vec = __msa_fill_h(rnd_val);
1142
1143    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1144    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1145
1146    filter_vec = LD_SH(filter);
1147    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1148
1149    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1150    mask1 = mask0 + 2;
1151    mask2 = mask0 + 4;
1152    mask3 = mask0 + 6;
1153    mask4 = mask0 + 8;
1154    mask5 = mask0 + 10;
1155    mask6 = mask0 + 12;
1156    mask7 = mask0 + 14;
1157
1158    for (loop_cnt = 64; loop_cnt--;) {
1159        LD_SB3(src, 16, src0, src1, src2);
1160        src3 = LD_SB(src + 40);
1161        src += src_stride;
1162        XORI_B4_128_SB(src0, src1, src2, src3);
1163
1164        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1165                   vec0, vec1, vec2, vec3);
1166        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1167                   vec4, vec5, vec6, vec7);
1168        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1169                   vec8, vec9, vec10, vec11);
1170        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1171                   vec12, vec13, vec14, vec15);
1172        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1173                                 filt3);
1174        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1175                                 filt3);
1176        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1177                                 filt3);
1178        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1179                                 filt2, filt3);
1180
1181        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1182                   vec0, vec1, vec2, vec3);
1183        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1184                   vec4, vec5, vec6, vec7);
1185        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1186                                 filt3);
1187        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1188                                 filt3);
1189
1190        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1191                                       offset_vec, rnd_vec, dst0, dst1, dst2,
1192                                       dst3);
1193        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1194                                       rnd_vec, dst4, dst5);
1195
1196        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1197        ST_UB2(out0, out1, dst, 16);
1198        ST_UB(out2, dst + 32);
1199        dst += dst_stride;
1200    }
1201}
1202
1203static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
1204                                      int32_t src_stride,
1205                                      uint8_t *dst,
1206                                      int32_t dst_stride,
1207                                      const int8_t *filter,
1208                                      int32_t height,
1209                                      int32_t weight,
1210                                      int32_t offset,
1211                                      int32_t rnd_val)
1212{
1213    uint8_t *src_tmp;
1214    uint8_t *dst_tmp;
1215    uint32_t loop_cnt, cnt;
1216    v16u8 out0, out1;
1217    v16i8 src0, src1, src2;
1218    v8i16 filt0, filt1, filt2, filt3;
1219    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1220    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1221    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1222    v8i16 dst0, dst1, dst2, dst3;
1223    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1224    v4i32 weight_vec, rnd_vec;
1225
1226    src -= 3;
1227
1228    weight_vec = __msa_fill_w(weight);
1229    rnd_vec = __msa_fill_w(rnd_val);
1230
    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);
            src_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
                                     filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                     filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                     filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);

            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_UB2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

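/* Vertical 8-tap uni-weighted MC, 4 pixels wide: two rows of right-
 * interleaved samples are packed per doubleword (e.g. src2110 holds the
 * row 1|0 and row 2|1 interleaves), so each 8-tap call yields two 4-pixel
 * output rows. */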
static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);

    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    ILVR_D3_SB(src21_r, src10_r, src43_r,
               src32_r, src65_r, src54_r, src2110, src4332, src6554);

    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                  filt1, filt2, filt3);
        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                  filt1, filt2, filt3);
        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
                                  filt0, filt1, filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
                                  filt0, filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

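/* Vertical 8-tap uni-weighted MC, 8 pixels wide: four output rows per
 * iteration, filtered from right byte-interleaves of consecutive rows. */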
static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

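/* Vertical 8-tap uni-weighted MC, 12 pixels wide: columns 0-7 are filtered
 * from the right byte-interleaves, columns 8-11 from the left interleaves
 * packed two rows per vector.  The loop covers a fixed 4 x 4 = 16 rows,
 * which matches the only 12-wide HEVC block size (12x16). */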
static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                 filt1, filt2, filt3);
        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

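/* Vertical 8-tap uni-weighted MC engine for widths that are multiples of
 * 16: weightmul16 holds width / 16, and each 16-wide column strip is
 * filtered four rows per iteration using right and left byte-interleaves. */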
static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               const int8_t *filter,
                                               int32_t height,
                                               int32_t weight,
                                               int32_t offset,
                                               int32_t rnd_val,
                                               int32_t weightmul16)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    int32_t loop_cnt, cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src98_r, src109_r, src98_l, src109_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = weightmul16; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src7, src8, src9, src10);

            ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_r, src32_r, src54_r, src21_r);
            ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
            ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_l, src32_l, src54_l, src21_l);
            ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
            dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
            dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
            dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
            dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                     filt1, filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src0 = src4;
            src1 = src5;
            src2 = src6;
            src3 = src7;
            src4 = src8;
            src5 = src9;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 1);
}

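/* Note: the 24- and 48-wide vertical wrappers below pass fixed heights
 * (32 and 64).  In HEVC these widths occur only as the 24x32 and 48x64
 * AMP partitions, so the height parameter is effectively constant here. */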
static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, 32, weight,
                                       offset, rnd_val, 1);

    hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
                             filter, 32, weight, offset, rnd_val);
}

static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 2);
}

static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, 64, weight,
                                       offset, rnd_val, 3);
}

static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter, height, weight,
                                       offset, rnd_val, 4);
}

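/* 2-D (horizontal, then vertical) 8-tap uni-weighted MC, 4 pixels wide:
 * the "4 width" mask set (ff_hevc_mask_arr + 16) shuffles across two source
 * registers, so each horizontal 8-tap call filters two rows at once (e.g.
 * dst30 holds rows 0 and 3); the vertical pass then runs on interleaved
 * 16-bit intermediates. */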
static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    denom_vec = rnd_vec - 6;

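    /* Same -128 input-bias compensation as in the 1-D paths, done in 32-bit
     * lanes here: offset += (128 * weight) >> (rnd_val - 6). */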
    const_128 = __msa_ldi_w(128);
    const_128 *= weight_vec;
    offset_vec += __msa_srar_w(const_128, denom_vec);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76_r = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst54_r;
        dst32_r = dst76_r;
        dst54_r = dst98_r;
        dst21_r = dst65_r;
        dst43_r = dst87_r;
        dst65_r = dst109_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}

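/* 2-D 8-tap uni-weighted MC engine for widths that are multiples of 8:
 * each 8-wide column strip keeps 16-bit intermediates from the horizontal
 * pass, widens to 32 bits in the vertical pass, and emits two rows per
 * inner-loop iteration. */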
static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val,
                                              int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= ((3 * src_stride) + 3);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    denom_vec = rnd_vec - 6;

    const_128 = __msa_ldi_w(128);
    const_128 *= weight_vec;
    offset_vec += __msa_srar_w(const_128, denom_vec);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += 2 * src_stride;
            XORI_B2_128_SB(src7, src8);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            /* row 8 */
            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;

            MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
            MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
            SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
            ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
            ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
            CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
            ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst10_r = dst32_r;
            dst32_r = dst54_r;
            dst54_r = dst76_r;
            dst10_l = dst32_l;
            dst32_l = dst54_l;
            dst54_l = dst76_l;
            dst21_r = dst43_r;
            dst43_r = dst65_r;
            dst65_r = dst87_r;
            dst21_l = dst43_l;
            dst43_l = dst65_l;
            dst65_l = dst87_l;
            dst6 = dst8;
        }

        src += 8;
        dst += 8;
    }
}

static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 8);
}

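/* 2-D 8-tap uni-weighted MC, 12 pixels wide: columns 0-7 follow the 8-wide
 * flow one row at a time, columns 8-11 the 4-wide flow with two rows per
 * vector.  Both loops cover a fixed 16 rows, matching the only 12-wide
 * HEVC block size (12x16). */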
static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
    v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst76_l, filter_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    denom_vec = rnd_vec - 6;

    const_128 = __msa_ldi_w(128);
    const_128 *= weight_vec;
    offset_vec += __msa_srar_w(const_128, denom_vec);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);
    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                             filt2, filt3);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src_tmp += src_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
        CLIP_SW2_0_255(dst0_r, dst0_l);
        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
        ST_D1(out, 0, dst_tmp);
        dst_tmp += dst_stride;

        dst0 = dst1;
        dst1 = dst2;
        dst2 = dst3;
        dst3 = dst4;
        dst4 = dst5;
        dst5 = dst6;
        dst6 = dst7;
    }

    src += 8;
    dst += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);
    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76_r = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
        CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst54_r;
        dst32_r = dst76_r;
        dst54_r = dst98_r;
        dst21_r = dst65_r;
        dst43_r = dst87_r;
        dst65_r = dst109_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}

static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 16);
}

static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 24);
}

static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 32);
}

static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 48);
}

static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 64);
}

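/* 4-tap horizontal uni-weighted MC, 4 pixels wide, specialized by height
 * (2, 4, and multiples of 8) and dispatched from hevc_hz_uniwgt_4t_4w_msa
 * below.  The 4-tap coefficients also sum to 64, so the -128 input bias is
 * folded into the offset exactly as in the 8-tap paths. */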
static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v8i16 filt0, filt1;
    v16i8 src0, src1, vec0, vec1;
    v16i8 mask1;
    v8i16 dst0;
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    dst0 = __msa_adds_s_h(dst0, offset_vec);
    CLIP_SH_0_255(dst0);
    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
    ST_W2(out, 0, 1, dst, dst_stride);
}

static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src -= 1;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

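/* Height 8/16 variant of the 4-wide case: eight rows per iteration, two
 * rows packed into each shuffle so four 4-tap filter calls cover them. */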
static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (8 == height || 16 == height) {
        hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight,
                                          offset, rnd_val);
    }
}

2551static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
2552                                     int32_t src_stride,
2553                                     uint8_t *dst,
2554                                     int32_t dst_stride,
2555                                     const int8_t *filter,
2556                                     int32_t height,
2557                                     int32_t weight,
2558                                     int32_t offset,
2559                                     int32_t rnd_val)
2560{
2561    v16u8 out0, out1, out2, out3;
2562    v8i16 filt0, filt1;
2563    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2564    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2565    v16i8 mask1;
2566    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2567    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2568    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2569    v4i32 weight_vec, rnd_vec;
2570
2571    src -= 1;
2572
2573    filter_vec = LD_SH(filter);
2574    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2575
2576    weight = weight & 0x0000FFFF;
2577
2578    weight_vec = __msa_fill_w(weight);
2579    rnd_vec = __msa_fill_w(rnd_val);
2580
2581    weight *= 128;
2582    rnd_val -= 6;
2583
2584    weight_vec_h = __msa_fill_h(weight);
2585    offset_vec = __msa_fill_h(offset);
2586    denom_vec = __msa_fill_h(rnd_val);
2587
2588    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2589    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2590
2591    mask1 = mask0 + 2;
2592
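    /* The height argument is unused: 6-wide blocks are processed as exactly
     * eight rows, each stored as a 4-byte word plus a 2-byte halfword. */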
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);
    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5, dst6, dst7);

    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
    PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);
    ST_W2(out2, 0, 2, dst, dst_stride);
    ST_H2(out2, 2, 6, dst + 4, dst_stride);
    ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}

static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v8i16 filt0, filt1, dst0, dst1;
    v16i8 src0, src1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(out, 0, 1, dst, dst_stride);
}

static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);

    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}

static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;

    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);

    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);

    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5);

    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;

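    /* Eight 8-pixel rows per iteration; each row supplies all of its own
     * taps, so it is shuffled only against itself. */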
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst4, dst5, dst6, dst7);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (6 == height) {
        hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else {
        hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight, offset,
                                          rnd_val);
    }
}

static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

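    /* Four rows per iteration with a fixed count of 4, so a height of 16 is
     * assumed (the height argument is unused).  mask0/mask1 cover columns
     * 0..7 of each row; mask2/mask3 gather columns 8..11 of two rows packed
     * into a single register. */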
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;

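    /* Each 16-pixel row is loaded as two 16-byte vectors (at src and
     * src + 8) so that every 4-tap window lies within a single register. */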
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst4, dst5, dst6, dst7);

        PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    out0, out1, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

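    /* Two rows per iteration with a fixed count of 16, so a height of 32 is
     * assumed (the height argument is unused).  mask2/mask3 handle the
     * windows straddling the two 16-byte loads of each row. */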
    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

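    /* Two 32-pixel rows per iteration, each loaded at offsets 0, 16 and 24
     * so the windows crossing the first 16-byte boundary (via mask2/mask3)
     * and the last eight columns stay within loaded registers. */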
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;
        LD_SB2(src, 16, src3, src4);
        src5 = LD_SB(src + 24);
        src += src_stride;
        XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst4, dst5, dst6, dst7);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v8i16 dst0;
    v4i32 dst0_r, dst0_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

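    /* Consecutive rows are interleaved and two row pairs packed per
     * register, so one HEVC_FILT_4TAP_SH call filters both output rows;
     * weighting, rounding, the offset add and the clip are done inline. */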
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    dst0 = __msa_adds_s_h(dst0, offset_vec);
    CLIP_SH_0_255(dst0);
    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
    ST_W2(out, 0, 1, dst, dst_stride);
}

static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0, dst1;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);
    dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

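    /* Eight rows per iteration; the last three input rows survive the
     * iteration as src2 and the packed src2110 history. */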
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src109_r, src98_r, src4332, src6554, src8776, src10998);
        XORI_B4_128_SB(src4332, src6554, src8776, src10998);
        dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2 = src10;
        src2110 = src10998;
    }
}

static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
                                          filter, height, weight, offset,
                                          rnd_val);
    }
}

static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

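    /* The height argument is unused: 6-wide blocks are processed as exactly
     * eight rows, stored as a 4-byte word plus a 2-byte halfword each. */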
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B3_128_SB(src0, src1, src2);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);

    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);
    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5, dst6, dst7);

    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
    PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);
    ST_W2(out2, 0, 2, dst, dst_stride);
    ST_H2(out2, 2, 6, dst + 4, dst_stride);
    ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}

static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0, dst1;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(out, 0, 1, dst, dst_stride);
}

static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src5, src6, src54_r, src65_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                   offset_vec, rnd_vec, dst0, dst1, dst2,
                                   dst3);
    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}

static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);

    XORI_B3_128_SB(src0, src1, src2);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                   offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5);
    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height,
                                          int32_t weight,
                                          int32_t offset,
                                          int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

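    /* Eight rows per iteration; the last input row (src2) and the last two
     * interleaved row pairs are carried over as history for the next
     * iteration. */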
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2 = src10;
        src10_r = src98_r;
        src21_r = src109_r;
    }
}

static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {
        hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else if (6 == height) {
        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter, weight, offset, rnd_val);
    } else {
        hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
                                      filter, height, weight, offset,
                                      rnd_val);
    }
}

static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
    v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

3858    for (loop_cnt = 2; loop_cnt--;) {
3859        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3860        src += (8 * src_stride);
3861        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3862        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3863        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3864        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3865        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
3866        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3867        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3868        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3869        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3870        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3871        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3872        dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3873        dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3874        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3875                                       offset_vec, rnd_vec, dst0, dst1, dst2,
3876                                       dst3);
3877        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3878                                       rnd_vec, dst4, dst5);
3879        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3880        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3881        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
3882        dst += (4 * dst_stride);
3883
3884        ILVRL_B2_SB(src7, src6, src76_r, src76_l);
3885        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
3886        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
3887        ILVRL_B2_SB(src10, src9, src109_r, src109_l);
3888        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
3889        src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
3890        dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3891        dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3892        dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3893        dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3894        dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3895        dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3896        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
3897                                       offset_vec, rnd_vec, dst6, dst7, dst8,
3898                                       dst9);
3899        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
3900                                       rnd_vec, dst10, dst11);
3901        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
3902        ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
3903        ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
3904        dst += (4 * dst_stride);
3905
3906        src2 = src10;
3907        src10_r = src98_r;
3908        src21_r = src109_r;
3909        src2110 = src10998;
3910    }
3911}
3912
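/* 16-width vertical 4-tap uni-weighted prediction, 4 rows per iteration
 * using the right- and left-interleaved halves of each 16-byte row. */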
static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src54_r, src54_l, src65_r, src65_l, src6;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
                    out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
    }
}

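/* 24-width vertical 4-tap uni-weighted prediction: a 16-wide column plus an
 * 8-wide column (src7..src13) per iteration. The loop count is fixed at 8
 * with 4 rows each, so a height of 32 is assumed. */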
static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
    v8i16 filt0, filt1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    LD_SB3(src + 16, src_stride, src7, src8, src9);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    XORI_B3_128_SB(src7, src8, src9);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        XORI_B4_128_SB(src10, src11, src12, src13);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
        ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
        ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
        dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
        dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
                    out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src9 = src13;
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src87_r = src1211_r;
        src98_r = src1312_r;
    }
}

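/* 32-width vertical 4-tap uni-weighted prediction: two 16-wide columns,
 * 2 rows per iteration. */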
static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= src_stride;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    LD_SB3(src + 16, src_stride, src5, src6, src7);
    src += (3 * src_stride);
    XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
    ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        LD_SB2(src + 16, src_stride, src8, src9);
        src += (2 * src_stride);
        XORI_B4_128_SB(src3, src4, src8, src9);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
        dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
                    out2, out3);
        ST_UB2(out0, out2, dst, 16);
        dst += dst_stride;
        ST_UB2(out1, out3, dst, 16);
        dst += dst_stride;

        src2 = src4;
        src7 = src9;
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src65_r = src87_r;
        src76_r = src98_r;
        src65_l = src87_l;
        src76_l = src98_l;
    }
}

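/* 4x2 horizontal+vertical (2-D) 4-tap uni-weighted prediction. The
 * horizontal pass shuffles two rows into one vector via the 4-width masks;
 * the vertical pass then filters the interleaved 16-bit intermediates. As
 * in the pure vertical kernels, (128 * weight) >> (rnd_val - 6) is folded
 * into offset_vec to undo the signed-range flip of the inputs. */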
static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
    SRAR_W2_SW(dst0, dst1, rnd_vec);
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp += offset_vec;
    CLIP_SH_0_255(tmp);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}

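/* 4x4 2-D 4-tap uni-weighted prediction; same scheme as the 4x2 kernel,
 * with seven input rows producing four output rows. */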
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
    MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

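/* 4-width 2-D 4-tap uni-weighted prediction for heights that are a
 * multiple of 8. */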
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

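/* 4-width 2-D dispatcher: fixed kernels for heights 2 and 4, the
 * multiple-of-8 loop otherwise; other heights are not expected here. */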
static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val);
    }
}

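/* 6-width 2-D 4-tap uni-weighted prediction, done in a single pass that
 * stores 4 + 2 bytes per row; a height of 8 is assumed. */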
static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
    MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
    SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
    SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
    SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}

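/* 8x2 2-D 4-tap uni-weighted prediction. */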
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}

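/* 2-D 4-tap uni-weighted prediction for width8mult * 8 columns and a fixed
 * height of 4; columns are walked 8 at a time. */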
static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t width8mult,
                                          int32_t weight,
                                          int32_t offset,
                                          int32_t rnd_val)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}

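/* 8x6 2-D 4-tap uni-weighted prediction: nine input rows filtered in one
 * pass to produce six output rows. */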
static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 offset_vec, const_128, denom_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
    MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
    SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}

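/* 2-D 4-tap uni-weighted prediction for width8mult * 8 columns and any
 * multiple-of-4 height; columns are walked 8 at a time, rows 4 at a time. */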
4871static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
4872                                              int32_t src_stride,
4873                                              uint8_t *dst,
4874                                              int32_t dst_stride,
4875                                              const int8_t *filter_x,
4876                                              const int8_t *filter_y,
4877                                              int32_t height,
4878                                              int32_t weight,
4879                                              int32_t offset,
4880                                              int32_t rnd_val,
4881                                              int32_t width8mult)
4882{
4883    uint32_t loop_cnt, cnt;
4884    uint8_t *src_tmp;
4885    uint8_t *dst_tmp;
4886    v16u8 out0, out1;
4887    v16i8 src0, src1, src2, src3, src4, src5, src6;
4888    v8i16 filt0, filt1;
4889    v8i16 filt_h0, filt_h1, filter_vec;
4890    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4891    v16i8 mask1;
4892    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4893    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4894    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4895    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4896    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4897    v8i16 offset_vec, const_128, denom_vec;
4898    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4899    v4i32 weight_vec, rnd_vec;
4900
4901    src -= (src_stride + 1);
4902
4903    filter_vec = LD_SH(filter_x);
4904    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4905
4906    filter_vec = LD_SH(filter_y);
4907    UNPCK_R_SB_SH(filter_vec, filter_vec);
4908
4909    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4910
4911    mask1 = mask0 + 2;
4912
4913    weight_vec = __msa_fill_w(weight);
4914    rnd_vec = __msa_fill_w(rnd_val);
4915
4916    offset_vec = __msa_fill_h(offset);
4917    denom_vec = __msa_fill_h(rnd_val - 6);
4918    const_128 = __msa_fill_h((128 * weight));
4919    offset_vec += __msa_srar_h(const_128, denom_vec);
4920
4921    for (cnt = width8mult; cnt--;) {
4922        src_tmp = src;
4923        dst_tmp = dst;
4924
4925        LD_SB3(src_tmp, src_stride, src0, src1, src2);
4926        src_tmp += (3 * src_stride);
4927        XORI_B3_128_SB(src0, src1, src2);
4928
4929        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4930        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4931        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4932        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4933        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4934        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4935
4936        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4937        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4938
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
            MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
            MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
            MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
            SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
            SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
            ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}

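/* 8-wide HV case: heights 2, 4 and 6 have dedicated kernels; any other
 * multiple of 4 goes through the generic 8-column loop. */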
static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, 1, weight,
                                      offset, rnd_val);
    } else if (6 == height) {
        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, weight,
                                  offset, rnd_val);
    } else if (0 == (height % 4)) {
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 1);
    }
}

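/* 12-wide HV case: one 8-wide column pass (as above), then a 4-wide pass
 * over the remaining columns using the second mask set from
 * ff_hevc_mask_arr, which packs two rows into each vector. */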
static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    src_tmp = src;
    dst_tmp = dst;

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src += 8;
    dst += 8;

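    /* Second pass: the rightmost 4 columns, with two rows packed into each
     * vector via mask2/mask3. */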
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

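/* 16-wide HV case: two 8-wide columns, with a dedicated kernel for
 * height 4. */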
static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    if (4 == height) {
        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, 2, weight, offset,
                                      rnd_val);
    } else {
        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 2);
    }
}

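/* 24- and 32-wide HV cases: the generic 8-column loop run three and four
 * times across, respectively. */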
static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 3);
}

static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 4);
}

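/* Entry points for the unidirectional weighted copy (integer-pel, no
 * filtering).  shift = denom + 14 - 8 is the HEVC weighted-prediction
 * scaling for 8-bit output, 14 bits being the intermediate precision. */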
#define UNIWGT_MC_COPY(WIDTH)                                                \
void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int height,            \
                                                      int denom,             \
                                                      int weight,            \
                                                      int offset,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    int shift = denom + 14 - 8;                                              \
    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
                                    height, weight, offset, shift);          \
}

UNIWGT_MC_COPY(4);
UNIWGT_MC_COPY(6);
UNIWGT_MC_COPY(8);
UNIWGT_MC_COPY(12);
UNIWGT_MC_COPY(16);
UNIWGT_MC_COPY(24);
UNIWGT_MC_COPY(32);
UNIWGT_MC_COPY(48);
UNIWGT_MC_COPY(64);

#undef UNIWGT_MC_COPY

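/* Entry points for unidirectional weighted one-dimensional filtering.
 * DIR1 selects the kernel name part (hz/vt), FILT_DIR picks mx or my as
 * the index into the qpel/epel filter table, and TAP is 8 for qpel,
 * 4 for epel. */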
#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
                                                         ptrdiff_t            \
                                                         dst_stride,          \
                                                         uint8_t *src,        \
                                                         ptrdiff_t            \
                                                         src_stride,          \
                                                         int height,          \
                                                         int denom,           \
                                                         int weight,          \
                                                         int offset,          \
                                                         intptr_t mx,         \
                                                         intptr_t my,         \
                                                         int width)           \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
    int shift = denom + 14 - 8;                                               \
                                                                              \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \
                                                 dst_stride, filter, height,  \
                                                 weight, offset, shift);      \
}

UNI_W_MC(qpel, h, 4, 8, hz, mx);
UNI_W_MC(qpel, h, 8, 8, hz, mx);
UNI_W_MC(qpel, h, 12, 8, hz, mx);
UNI_W_MC(qpel, h, 16, 8, hz, mx);
UNI_W_MC(qpel, h, 24, 8, hz, mx);
UNI_W_MC(qpel, h, 32, 8, hz, mx);
UNI_W_MC(qpel, h, 48, 8, hz, mx);
UNI_W_MC(qpel, h, 64, 8, hz, mx);

UNI_W_MC(qpel, v, 4, 8, vt, my);
UNI_W_MC(qpel, v, 8, 8, vt, my);
UNI_W_MC(qpel, v, 12, 8, vt, my);
UNI_W_MC(qpel, v, 16, 8, vt, my);
UNI_W_MC(qpel, v, 24, 8, vt, my);
UNI_W_MC(qpel, v, 32, 8, vt, my);
UNI_W_MC(qpel, v, 48, 8, vt, my);
UNI_W_MC(qpel, v, 64, 8, vt, my);

UNI_W_MC(epel, h, 4, 4, hz, mx);
UNI_W_MC(epel, h, 6, 4, hz, mx);
UNI_W_MC(epel, h, 8, 4, hz, mx);
UNI_W_MC(epel, h, 12, 4, hz, mx);
UNI_W_MC(epel, h, 16, 4, hz, mx);
UNI_W_MC(epel, h, 24, 4, hz, mx);
UNI_W_MC(epel, h, 32, 4, hz, mx);

UNI_W_MC(epel, v, 4, 4, vt, my);
UNI_W_MC(epel, v, 6, 4, vt, my);
UNI_W_MC(epel, v, 8, 4, vt, my);
UNI_W_MC(epel, v, 12, 4, vt, my);
UNI_W_MC(epel, v, 16, 4, vt, my);
UNI_W_MC(epel, v, 24, 4, vt, my);
UNI_W_MC(epel, v, 32, 4, vt, my);

#undef UNI_W_MC

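/* Entry points for unidirectional weighted two-dimensional (hv)
 * filtering; mx and my each select their own filter. */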
#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                          \
void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,           \
                                                      ptrdiff_t dst_stride,   \
                                                      uint8_t *src,           \
                                                      ptrdiff_t src_stride,   \
                                                      int height,             \
                                                      int denom,              \
                                                      int weight,             \
                                                      int offset,             \
                                                      intptr_t mx,            \
                                                      intptr_t my,            \
                                                      int width)              \
{                                                                             \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
    int shift = denom + 14 - 8;                                               \
                                                                              \
    hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                           filter_x, filter_y, height,        \
                                           weight, offset, shift);            \
}

UNI_W_MC_HV(qpel, 4, 8);
UNI_W_MC_HV(qpel, 8, 8);
UNI_W_MC_HV(qpel, 12, 8);
UNI_W_MC_HV(qpel, 16, 8);
UNI_W_MC_HV(qpel, 24, 8);
UNI_W_MC_HV(qpel, 32, 8);
UNI_W_MC_HV(qpel, 48, 8);
UNI_W_MC_HV(qpel, 64, 8);

UNI_W_MC_HV(epel, 4, 4);
UNI_W_MC_HV(epel, 6, 4);
UNI_W_MC_HV(epel, 8, 4);
UNI_W_MC_HV(epel, 12, 4);
UNI_W_MC_HV(epel, 16, 4);
UNI_W_MC_HV(epel, 24, 4);
UNI_W_MC_HV(epel, 32, 4);

#undef UNI_W_MC_HV
