1/*
2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "libavcodec/mips/hevcdsp_mips.h"
23#include "libavcodec/mips/hevc_macros_msa.h"
24
/* Byte shuffle masks for __msa_vshf_b used by the horizontal filters.
 * Row 0 (8-width cases): overlapping byte pairs (i, i+1) taken from a single
 * source vector, producing the sliding window an 8-tap filter needs.
 * Row 1 (4-width cases): the same sliding-pair pattern, but indices >= 16
 * select bytes from a second source vector, so one shuffle/filter pass can
 * process two 4-pixel rows packed into two registers at once. */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases: second half indexes into a second vector (16..20) */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
30
/* Bi-weight, round and clip two vectors of eight 16-bit samples each.
 * in0/in1  : 16-bit samples of the current prediction
 * vec0/vec1: 16-bit intermediate samples from the other prediction direction
 * wgt      : v4i32 whose every lane packs the two weights (low halfword
 *            multiplies the vecN sample, high halfword the inN sample)
 * rnd      : per-lane arithmetic right-shift amount with rounding
 * offset   : v4i32 additive offset, used as the dot-product accumulator
 * out0/out1: weighted, rounded results clipped to [0, 255]
 *
 * ILVR/ILVL pair each vecN sample with the corresponding inN sample so a
 * single __msa_dpadd_s_w computes vec*w0 + in*w1 + offset per 32-bit lane;
 * PCKEV then narrows the four 32-bit vectors back to two 16-bit vectors. */
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
                           out0, out1)                              \
{                                                                   \
    v4i32 out0_r, out1_r, out0_l, out1_l;                           \
                                                                    \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
                                                                    \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
                                                                    \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);        \
    CLIP_SH2_0_255(out0, out1);                                     \
}
48
/* Four-vector variant of HEVC_BIW_RND_CLIP2: applies the same
 * bi-weight + round + clip to (in0..in3, vec0..vec3) -> (out0..out3). */
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,       \
                           wgt, rnd, offset, out0, out1, out2, out3)         \
{                                                                            \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1);  \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3);  \
}
55
/* Same operation as HEVC_BIW_RND_CLIP2 (see above); the body is currently
 * byte-identical.  NOTE(review): the _MAX_SATU name presumably dates from
 * when this variant used a saturating clip macro that was since unified
 * with CLIP_SH2_0_255 — the duplicate could be folded into one macro;
 * kept separate here to leave call sites untouched. */
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
                                    offset, out0, out1)              \
{                                                                    \
    v4i32 out0_r, out1_r, out0_l, out1_l;                            \
                                                                     \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);   \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);   \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);   \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
    CLIP_SH2_0_255(out0, out1);                                      \
}
71
/* Four-vector variant of HEVC_BIW_RND_CLIP2_MAX_SATU:
 * (in0..in3, vec0..vec3) -> (out0..out3). */
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
                                    vec3, wgt, rnd, offset, out0, out1,    \
                                    out2, out3)                            \
{                                                                          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                                out0, out1);                               \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
                                out2, out3);                               \
}
81
/* Bi-directional weighted prediction, copy (no interpolation) case, for
 * 4-pixel-wide blocks.
 *
 * src0_ptr/src_stride  : 8-bit reference samples of the current prediction
 * src1_ptr/src2_stride : 16-bit intermediate samples stored from the other
 *                        prediction direction (stride counted in int16_t)
 * dst/dst_stride       : 8-bit output block
 * height               : rows to process; handled cases are 2, 4 and any
 *                        multiple of 8.  Other heights write nothing —
 *                        presumably callers never pass them; TODO confirm.
 * weight0/weight1      : weights of the two prediction directions
 * offset0/offset1      : additive offsets of the two directions
 * rnd_val              : base rounding shift; the final shift is rnd_val + 1
 */
static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    int32_t offset, weight;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    /* Pre-scale the combined offset so it can be added before the final
     * (rnd_val + 1) shift; pack both weights into one 32-bit lane so a
     * single dot-product applies them pairwise. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        /* Two 4-byte rows packed into one vector; src1 rows are 8 bytes. */
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        /* Widen to 16 bits and scale to the 14-bit intermediate domain. */
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        CLIP_SH_0_255(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                    offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        /* Eight rows per iteration: 4 rows per src vector. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
170
/* Bi-directional weighted prediction, copy case, for 6-pixel-wide blocks.
 * Processes 4 rows per iteration; assumes height is a multiple of 4 —
 * presumably guaranteed by the HEVC PU sizes that map to width 6;
 * TODO confirm.  Parameters as in hevc_biwgt_copy_4w_msa.  Each output row
 * is stored as one 4-byte word plus one 2-byte halfword (6 pixels). */
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* Rows are loaded 8 bytes wide; only 6 are stored at the end. */
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        /* 6 pixels per row = word (bytes 0-3) + halfword (bytes 4-5). */
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}
224
/* Bi-directional weighted prediction, copy case, for 8-pixel-wide blocks.
 * Handled heights: 2, 6, and any multiple of 4; other heights write
 * nothing — presumably never passed by callers; TODO confirm.
 * Parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint64_t tp0, tp1, tp2, tp3;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        /* Widen to 16 bits and scale to the 14-bit intermediate domain. */
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 4) {
        uint32_t loop_cnt;

        /* Generic path: 4 rows (two rows per vector) per iteration. */
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
312
/* Bi-directional weighted prediction, copy case, for 12-pixel-wide blocks.
 * NOTE(review): the loop count is hardcoded to (16 >> 2), i.e. exactly 16
 * rows, and the 'height' parameter is unused — presumably because 12-wide
 * HEVC PUs (from AMP 12x16 partitions) always have height 16; any other
 * height passed in would be processed incorrectly.  TODO confirm against
 * the dispatch tables in hevcdsp.
 * Other parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    /* 4 iterations x 4 rows = 16 rows; see height note above. */
    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        /* in0..in3: left 8 columns of src1; in4..in7: right 4 columns. */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        /* Pack the four 4-sample right-column rows into two full vectors. */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        /* Gather columns 8..11 of the four rows for the narrow half. */
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);

        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        /* 8 left pixels per row as doublewords, 4 right pixels as words. */
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
371
/* Bi-directional weighted prediction, copy case, for 16-pixel-wide blocks.
 * Processes 4 full rows per iteration; assumes height is a multiple of 4.
 * Parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        /* in0..in3: left 8 columns, in4..in7: right 8 columns of src1. */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* tmp0..tmp3: low byte halves widened; tmp4..tmp7: high halves. */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
426
/* Bi-directional weighted prediction, copy case, for 24-pixel-wide blocks.
 * NOTE(review): the loop count is hardcoded to 8 (8 x 4 = 32 rows) and
 * 'height' is unused — presumably because 24-wide HEVC PUs (AMP 24x32
 * partitions) always have height 32; any other height would be processed
 * incorrectly.  TODO confirm against the hevcdsp dispatch.
 * Other parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    /* 8 iterations x 4 rows = 32 rows; see height note above. */
    for (loop_cnt = 8; loop_cnt--;) {
        /* src0/1/4/5: left 16 columns; src2/3/6/7: right 8 columns. */
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        /* The in* ordering below pairs each widened vector with the
         * matching 8 columns of the stored intermediates. */
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        /* Left 16 pixels per row as full vectors, right 8 as doublewords. */
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}
490
/* Bi-directional weighted prediction, copy case, for 32-pixel-wide blocks.
 * Processes 2 rows (two 16-byte vectors each) per iteration; assumes
 * height is even.  Parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        /* Widen each 16-byte source half into low/high 16-bit vectors. */
        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}
551
/* Bi-directional weighted prediction, copy case, for 48-pixel-wide blocks.
 * NOTE(review): the loop count is hardcoded to 64 rows and 'height' is
 * unused — presumably because 48-wide HEVC PUs (AMP 48x64 partitions)
 * always have height 64; any other height would be processed incorrectly.
 * TODO confirm against the hevcdsp dispatch.
 * Other parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    /* One 48-pixel row (three 16-byte vectors) per iteration, 64 rows. */
    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}
603
/* Bi-directional weighted prediction, copy case, for 64-pixel-wide blocks.
 * Processes one full 64-pixel row (four 16-byte vectors) per iteration,
 * for 'height' rows.  Parameters as in hevc_biwgt_copy_4w_msa. */
static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    /* Combined offset pre-scaled by rnd_val; both weights packed per lane. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        /* Widen each 16-byte chunk into low (tmp0-3) / high (tmp4-7). */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}
658
/* 8-tap horizontal (qpel) filter with bi-directional weighted prediction
 * for 4-pixel-wide blocks.
 *
 * src0_ptr/src_stride  : 8-bit reference; shifted left by 3 columns so the
 *                        8-tap window is centered on each output pixel
 * src1_ptr/src2_stride : 16-bit intermediates from the other direction
 * dst/dst_stride       : 8-bit output
 * filter               : 8 signed 8-tap coefficients (taps sum to 64)
 * height               : rows; assumed to be a multiple of 4
 * weight0/weight1, offset0/offset1, rnd_val: as in hevc_biwgt_copy_4w_msa
 *
 * Uses the two-vector shuffle masks (ff_hevc_mask_arr[16..31]) so one
 * filter pass produces two 4-pixel rows at once. */
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..mask3 select the successive byte pairs for taps 0-7. */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* The sources are biased by -128 (XORI_B4_128_SB below) to use signed
     * multiplies; since the taps sum to 64, the filtered output is low by
     * 128 << 6, which after weighting amounts to 128 * weight1 << 6 — add
     * it back into the offset here so the result is exact. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        /* Pack the four 4-sample intermediate rows into two vectors. */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* Each VSHF_B4 pulls windows from two row registers (mask bytes
         * >= 16 index the second register), filtering two rows at once. */
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
730
/* HEVC horizontal 8-tap bi-weighted prediction, 8-pixel-wide block (MSA).
 * Same parameter contract as hevc_hz_biwgt_8t_4w_msa; here each row fits a
 * single source vector, so each VSHF shuffles one vector against itself. */
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);  /* 8-wide shuffle pattern */

    src0_ptr -= 3;                  /* first tap of the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    /* weight0 in low halfword, weight1 in high halfword of each lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per iteration */
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* 8-tap horizontal filter, one row at a time */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* weight, offset, round and clip all four rows */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        /* pack to bytes and store four 8-byte rows */
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
810
/* HEVC horizontal 8-tap bi-weighted prediction, 12-pixel-wide block (MSA).
 * Each iteration handles 4 rows as an 8-wide pass (mask0..3, per-row
 * shuffles) followed by a 4-wide pass on columns 8..11 (mask4..7,
 * two-row shuffles).
 * NOTE(review): the loop count is fixed at 4 (4 x 4 = 16 rows) and the
 * 'height' argument is unused — presumably callers always pass height 16
 * for 12-wide blocks; verify against the dispatch code. */
static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                  /* first tap of the 8-tap window */

    /* pack weight0 (low halfword) with weight1 (high halfword) per lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);    /* 8-wide pattern */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);   /* 4-wide two-vector pattern */
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* ---- left 8 columns, 4 rows ---- */
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        /* ---- right 4 columns (offset +8), same 4 rows ---- */
        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        /* merge the four 4-wide int16 rows into two full vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        /* two rows per shuffle via the two-vector masks */
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
905
/* HEVC horizontal 8-tap bi-weighted prediction, 16-pixel-wide block (MSA).
 * Two rows per loop iteration; each row is loaded as two overlapping 16-byte
 * vectors (at +0 and +8) so every 8-wide filter pass stays in one vector. */
static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* same pattern as ff_hevc_mask_arr[0], written as an immediate */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;                  /* first tap of the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    /* weight0 in low halfword, weight1 in high halfword of each lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {   /* 2 rows per iteration */
        LD_SB2(src0_ptr, 8, src0, src1);    /* row 0: halves at +0 and +8 */
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);    /* row 1 */
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* weight, offset, round and clip all four 8-wide partial rows */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        /* pack to bytes and store two full 16-byte rows */
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
989
/* HEVC horizontal 8-tap bi-weighted prediction, 24-pixel-wide block (MSA).
 * Software-pipelined: each iteration filters the previously loaded row
 * (16 cols via dst0/dst1, the last 8 via dst2 handled inline) while loading
 * the next one; the final row is processed after the loop.
 * NOTE(review): the loop runs a fixed 31 iterations plus the epilogue, i.e.
 * 32 rows total, and 'height' is unused — presumably callers always pass
 * height 32 for 24-wide blocks; verify against the dispatch code. */
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;        /* first tap of the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    /* weight0 in low halfword, weight1 in high halfword of each lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* mask4..7 straddle src0/src1 for the columns 8..15 filter windows */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    /* pipeline prologue: preload the first row */
    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);

    for (loop_cnt = 31; loop_cnt--;) {
        /* columns 0..7 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        /* columns 8..15 (windows straddle the two source vectors) */
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        /* columns 16..23 */
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        /* third 8-wide lane: open-coded weight/offset/round/clip
           (same math as the HEVC_BIW_RND_CLIP macros) */
        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        CLIP_SH_0_255(out2);

        /* load the next row before storing, to hide load latency */
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);           /* first 16 bytes */
        SD(dst_val0, dst + 16);     /* remaining 8 bytes */
        dst += dst_stride;
    }

    /* pipeline epilogue: process the last preloaded row */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    CLIP_SH_0_255(out2);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}
1111
/* HEVC horizontal 8-tap bi-weighted prediction, 32-pixel-wide block (MSA).
 * One row per iteration, filtered as four 8-wide lanes; src2 is an
 * additional load at +24 so the last lane's filter windows fit one vector. */
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                  /* first tap of the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    /* weight0 in low halfword, weight1 in high halfword of each lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* mask4..7 straddle src0/src1 for the columns 8..15 filter windows */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {  /* one row per iteration */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);    /* re-aligned load for cols 24..31 */
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* weight, offset, round and clip all four 8-wide lanes */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        /* pack to bytes and store the full 32-byte row */
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}
1197
/* HEVC horizontal 8-tap bi-weighted prediction, 48-pixel-wide block (MSA).
 * One row per iteration: columns 0..31 like the 32-wide path, then columns
 * 32..47 via two extra vectors loaded at +32 and +40.
 * NOTE(review): the loop count is fixed at 64 and 'height' is unused —
 * presumably callers always pass height 64 for 48-wide blocks; verify. */
static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                  /* first tap of the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    /* weight0 in low halfword, weight1 in high halfword of each lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* mask4..7 straddle src0/src1 for the columns 8..15 filter windows */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        /* ---- columns 0..31 ---- */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);    /* re-aligned load for cols 24..31 */
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);
        LD_SB2(src0_ptr + 32, 8, src3, src4);   /* cols 32..47 */
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        /* ---- columns 32..47 ---- */
        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}
1301
/* HEVC horizontal 8-tap bi-weighted prediction, 64-pixel-wide block (MSA).
 * One row per outer iteration; the inner loop runs the 32-wide kernel twice
 * (columns 0..31 then 32..63) using temporary pointers. */
static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                  /* first tap of the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    /* weight0 in low halfword, weight1 in high halfword of each lane */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* compensate the -128 bias from XORI_B*_128 (taps sum to 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);    /* +1 for the bi-pred average */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* mask4..7 straddle src0/src1 for the columns 8..15 filter windows */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {  /* one row per iteration */
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {     /* two 32-wide halves of the row */
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);    /* re-aligned, cols 24..31 */
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            /* weight, offset, round and clip the four 8-wide lanes */
            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;

    }
}
1400
/* Vertical 8-tap bi-weighted HEVC interpolation for 4-column blocks (MSA).
 *
 * src0_ptr / src_stride  : 8-bit reference pixels (filtered here)
 * src1_ptr / src2_stride : 16-bit intermediate prediction (second source)
 * dst / dst_stride       : 8-bit output block
 * filter                 : 8 vertical filter taps
 * height                 : row count; the loop emits 8 rows per iteration,
 *                          so height is assumed to be a multiple of 8
 * weight0/weight1, offset0/offset1, rnd_val : bi-weighted average params
 *
 * Two 4-pixel rows are packed into each 16-byte vector (ILVR_D), so every
 * dot product filters two output rows at once.
 */
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    /* the 8-tap filter needs 3 rows of history above the first output row */
    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack one weight per halfword lane so a single dpadd_s_w blends both
     * prediction sources in one instruction */
    weight = weight0 | (weight1 << 16);

    /* pre-bias the offset with (128 << 6) * weight1 to undo the -128 level
     * shift applied below via XORI_B*_128 (the filter taps sum to 64) */
    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load the 7 history rows and pre-interleave the row pairs */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    /* two 4-wide row pairs share each 16-byte vector */
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* 8 new pixel rows and 8 rows of the 16-bit prediction per pass */
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        /* accumulate the four tap pairs; each dst holds two filtered rows */
        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        /* weighted average with the second prediction, round, clip to 8 bit */
        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* slide the vertical reuse window down by 8 rows */
        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
1498
/* Vertical 8-tap bi-weighted HEVC interpolation for 8-column blocks (MSA).
 *
 * src0_ptr / src_stride  : 8-bit reference pixels (filtered here)
 * src1_ptr / src2_stride : 16-bit intermediate prediction (second source)
 * dst / dst_stride       : 8-bit output block
 * filter                 : 8 vertical filter taps
 * height                 : row count; 4 rows are produced per iteration,
 *                          so height is assumed to be a multiple of 4
 * weight0/weight1, offset0/offset1, rnd_val : bi-weighted average params
 */
static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    /* the 8-tap filter needs 3 rows of history above the first output row */
    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack one weight per halfword lane so a single dpadd_s_w blends both
     * prediction sources in one instruction */
    weight = weight0 | (weight1 << 16);

    /* pre-bias the offset with (128 << 6) * weight1 to undo the -128 level
     * shift applied below via XORI_B*_128 (the filter taps sum to 64) */
    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load and interleave the first 7 (history) rows */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 4 new pixel rows and 4 rows of the 16-bit prediction per pass */
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        /* accumulate the four tap pairs; tmpN is output row N */
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, tmp0, tmp1, tmp2, tmp3);

        /* weighted average with the second prediction, round, clip to 8 bit */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the vertical reuse window down by 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
1586
/* Vertical 8-tap bi-weighted HEVC interpolation for 12-column blocks (MSA).
 *
 * Columns 0..7 are filtered from the right-interleaved (_r) row pairs;
 * columns 8..11 come from the left-interleaved (_l) halves, with two rows
 * packed per vector (src2110 and friends).
 *
 * NOTE(review): the loop count is hard-wired to 8 with 2 rows per
 * iteration, i.e. this path emits exactly 16 rows and ignores 'height' —
 * confirm that callers only use it with height == 16.
 */
static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 out0, out1, out2, filter_vec;
    v4i32 dst2_r, dst2_l;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    /* the 8-tap filter needs 3 rows of history above the first output row */
    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack one weight per halfword lane so a single dpadd_s_w blends both
     * prediction sources in one instruction */
    weight = weight0 | (weight1 << 16);

    /* pre-bias the offset with (128 << 6) * weight1 to undo the -128 level
     * shift applied below via XORI_B*_128 (the filter taps sum to 64) */
    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* prologue: load the 7 history rows; _r pairs feed columns 0..7,
     * _l pairs (packed two rows per vector) feed columns 8..11 */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 8; loop_cnt--;) {
        /* 2 new pixel rows; the 16-bit prediction is loaded in two parts:
         * in0/in1 for columns 0..7, in2/in3 for columns 8..11 */
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
        XORI_B2_128_SB(src7, src8);

        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);

        /* tmp0/tmp1: rows 0/1 of columns 0..7; tmp2: both rows, cols 8..11 */
        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2);
        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);

        /* bi-weighted average for the 8-wide part */
        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        /* same weighting/rounding/clipping done by hand for the 4-wide part */
        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        CLIP_SH_0_255(out2);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        ST_D2(out0, 0, 1, dst, dst_stride);
        ST_W2(out2, 0, 1, dst + 8, dst_stride);
        dst += (2 * dst_stride);

        /* slide both reuse windows down by 2 rows */
        src10_r = src32_r;
        src32_r = src54_r;
        src54_r = src76_r;
        src21_r = src43_r;
        src43_r = src65_r;
        src65_r = src87_r;
        src2110 = src4332;
        src4332 = src6554;
        src6554 = src8776;
        src6 = src8;
    }
}
1696
/* Vertical 8-tap bi-weighted HEVC interpolation for widths that are a
 * multiple of 16 (MSA).
 *
 * The outer loop walks 16-column stripes (width >> 4); the inner loop
 * emits 2 rows per iteration.  For each stripe, the right (_r) and left
 * (_l) byte interleaves of a row pair cover columns 0..7 and 8..15
 * respectively.
 *
 * src0_ptr / src_stride  : 8-bit reference pixels (filtered here)
 * src1_ptr / src2_stride : 16-bit intermediate prediction (second source)
 * dst / dst_stride       : 8-bit output block
 * filter                 : 8 vertical filter taps
 * height                 : row count (assumed even); width: multiple of 16
 * weight0/weight1, offset0/offset1, rnd_val : bi-weighted average params
 */
static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                              int32_t src_stride,
                                              int16_t *src1_ptr,
                                              int32_t src2_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight0,
                                              int32_t weight1,
                                              int32_t offset0,
                                              int32_t offset1,
                                              int32_t rnd_val,
                                              int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    /* the 8-tap filter needs 3 rows of history above the first output row */
    src0_ptr -= (3 * src_stride);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack one weight per halfword lane so a single dpadd_s_w blends both
     * prediction sources in one instruction */
    weight = weight0 | (weight1 << 16);

    /* pre-bias the offset with (128 << 6) * weight1 to undo the -128 level
     * shift applied below via XORI_B*_128 (the filter taps sum to 64) */
    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* one pass per 16-column stripe */
    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        /* prologue: load and interleave the 7 history rows of this stripe */
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            /* 2 new pixel rows; prediction rows are split 0..7 / 8..15 */
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);

            XORI_B2_128_SB(src7, src8);
            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            /* tmp0/tmp1: rows 0/1 low half; tmp2/tmp3: rows 0/1 high half */
            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);

            /* weighted average with the second prediction, round, clip */
            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
            ST_SH2(out0, out1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* slide both reuse windows down by 2 rows */
            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        /* advance to the next 16-column stripe */
        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}
1812
1813static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
1814                                     int32_t src_stride,
1815                                     int16_t *src1_ptr,
1816                                     int32_t src2_stride,
1817                                     uint8_t *dst,
1818                                     int32_t dst_stride,
1819                                     const int8_t *filter,
1820                                     int32_t height,
1821                                     int32_t weight0,
1822                                     int32_t weight1,
1823                                     int32_t offset0,
1824                                     int32_t offset1,
1825                                     int32_t rnd_val)
1826{
1827    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1828                                      src1_ptr, src2_stride,
1829                                      dst, dst_stride, filter, height,
1830                                      weight0, weight1, offset0, offset1,
1831                                      rnd_val, 16);
1832}
1833
1834static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
1835                                     int32_t src_stride,
1836                                     int16_t *src1_ptr,
1837                                     int32_t src2_stride,
1838                                     uint8_t *dst,
1839                                     int32_t dst_stride,
1840                                     const int8_t *filter,
1841                                     int32_t height,
1842                                     int32_t weight0,
1843                                     int32_t weight1,
1844                                     int32_t offset0,
1845                                     int32_t offset1,
1846                                     int32_t rnd_val)
1847{
1848    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1849                                      src1_ptr, src2_stride,
1850                                      dst, dst_stride, filter, height,
1851                                      weight0, weight1, offset0, offset1,
1852                                      rnd_val, 16);
1853    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
1854                            src1_ptr + 16, src2_stride,
1855                            dst + 16, dst_stride, filter, height,
1856                            weight0, weight1, offset0, offset1, rnd_val);
1857}
1858
1859static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1860                                     int32_t src_stride,
1861                                     int16_t *src1_ptr,
1862                                     int32_t src2_stride,
1863                                     uint8_t *dst,
1864                                     int32_t dst_stride,
1865                                     const int8_t *filter,
1866                                     int32_t height,
1867                                     int32_t weight0,
1868                                     int32_t weight1,
1869                                     int32_t offset0,
1870                                     int32_t offset1,
1871                                     int32_t rnd_val)
1872{
1873    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1874                                      src1_ptr, src2_stride,
1875                                      dst, dst_stride, filter, height,
1876                                      weight0, weight1, offset0, offset1,
1877                                      rnd_val, 32);
1878}
1879
1880static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1881                                     int32_t src_stride,
1882                                     int16_t *src1_ptr,
1883                                     int32_t src2_stride,
1884                                     uint8_t *dst,
1885                                     int32_t dst_stride,
1886                                     const int8_t *filter,
1887                                     int32_t height,
1888                                     int32_t weight0,
1889                                     int32_t weight1,
1890                                     int32_t offset0,
1891                                     int32_t offset1,
1892                                     int32_t rnd_val)
1893{
1894    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1895                                      src1_ptr, src2_stride,
1896                                      dst, dst_stride, filter, height,
1897                                      weight0, weight1, offset0, offset1,
1898                                      rnd_val, 48);
1899}
1900
1901static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1902                                     int32_t src_stride,
1903                                     int16_t *src1_ptr,
1904                                     int32_t src2_stride,
1905                                     uint8_t *dst,
1906                                     int32_t dst_stride,
1907                                     const int8_t *filter,
1908                                     int32_t height,
1909                                     int32_t weight0,
1910                                     int32_t weight1,
1911                                     int32_t offset0,
1912                                     int32_t offset1,
1913                                     int32_t rnd_val)
1914{
1915    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1916                                      src1_ptr, src2_stride,
1917                                      dst, dst_stride, filter, height,
1918                                      weight0, weight1, offset0, offset1,
1919                                      rnd_val, 64);
1920}
1921
/* 2-D (horizontal + vertical) 8-tap bi-weighted HEVC interpolation for
 * 4-column blocks (MSA).
 *
 * The horizontal pass (VSHF byte gathers + 8-tap dot products) produces
 * 16-bit intermediates; the vertical pass combines them in 32-bit
 * precision, shifts down by 6, and applies the bi-weighted average with
 * the 16-bit prediction from src1_ptr.
 *
 * mask0 is loaded from the second half of ff_hevc_mask_arr: it gathers
 * 4-wide taps from two source registers at once, so every shuffle/filter
 * covers two input rows.
 */
static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    /* back up 3 rows and 3 columns: both 8-tap filters need 3 taps of
     * history before the first output sample */
    src0_ptr -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack one weight per halfword lane so a single dpadd_s_w blends both
     * prediction sources in one instruction */
    weight = weight0 | (weight1 << 16);

    /* pre-bias the offset with 128 * weight1 << 6 to undo the -128 level
     * shift applied below via XORI_B*_128 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    /* prologue: horizontally filter the 7 history rows */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* each shuffle gathers taps for two rows (e.g. rows 3 and 0 in dst30) */
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    /* interleave consecutive rows for the vertical pass */
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        /* 4 new pixel rows plus 4 rows of the 16-bit prediction */
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        /* horizontal pass for the new rows */
        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        /* vertical pass in 32-bit precision, one row per dstN */
        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        /* bi-weighted average with src1, round, clip, pack to bytes */
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the vertical reuse window down by 4 rows */
        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
2066
2067static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
2068                                             int32_t src_stride,
2069                                             int16_t *src1_ptr,
2070                                             int32_t src2_stride,
2071                                             uint8_t *dst,
2072                                             int32_t dst_stride,
2073                                             const int8_t *filter_x,
2074                                             const int8_t *filter_y,
2075                                             int32_t height,
2076                                             int32_t weight0,
2077                                             int32_t weight1,
2078                                             int32_t offset0,
2079                                             int32_t offset1,
2080                                             int32_t rnd_val,
2081                                             int32_t width8mult)
2082{
2083    uint32_t loop_cnt, cnt;
2084    int32_t offset, weight;
2085    uint8_t *src0_ptr_tmp;
2086    int16_t *src1_ptr_tmp;
2087    uint8_t *dst_tmp;
2088    v16u8 out;
2089    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2090    v8i16 in0, in1;
2091    v8i16 filt0, filt1, filt2, filt3;
2092    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2093    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2094    v16i8 mask1, mask2, mask3;
2095    v8i16 filter_vec, weight_vec;
2096    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100    v8i16 tmp0, tmp1, tmp2, tmp3;
2101    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105    v4i32 offset_vec, rnd_vec, const_vec;
2106
2107    src0_ptr -= ((3 * src_stride) + 3);
2108
2109    offset = (offset0 + offset1) << rnd_val;
2110    weight0 = weight0 & 0x0000FFFF;
2111    weight = weight0 | (weight1 << 16);
2112
2113    const_vec = __msa_fill_w((128 * weight1));
2114    const_vec <<= 6;
2115    offset_vec = __msa_fill_w(offset);
2116    rnd_vec = __msa_fill_w(rnd_val + 1);
2117    offset_vec += const_vec;
2118    weight_vec = (v8i16) __msa_fill_w(weight);
2119
2120    filter_vec = LD_SH(filter_x);
2121    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2122
2123    filter_vec = LD_SH(filter_y);
2124    UNPCK_R_SB_SH(filter_vec, filter_vec);
2125
2126    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2127
2128    mask1 = mask0 + 2;
2129    mask2 = mask0 + 4;
2130    mask3 = mask0 + 6;
2131
2132    for (cnt = width8mult; cnt--;) {
2133        src0_ptr_tmp = src0_ptr;
2134        src1_ptr_tmp = src1_ptr;
2135        dst_tmp = dst;
2136
2137        LD_SB7(src0_ptr_tmp, src_stride,
2138               src0, src1, src2, src3, src4, src5, src6);
2139        src0_ptr_tmp += (7 * src_stride);
2140
2141        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2142
2143        /* row 0 row 1 row 2 row 3 */
2144        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2145                   vec0, vec1, vec2, vec3);
2146        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2147                   vec4, vec5, vec6, vec7);
2148        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2149                   vec8, vec9, vec10, vec11);
2150        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151                   vec12, vec13, vec14, vec15);
2152
2153        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2154                                 filt3);
2155        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2156                                 filt3);
2157        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2158                                 filt3);
2159        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2160                                 filt2, filt3);
2161
2162        /* row 4 row 5 row 6 */
2163        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164                   vec0, vec1, vec2, vec3);
2165        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166                   vec4, vec5, vec6, vec7);
2167        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168                   vec8, vec9, vec10, vec11);
2169
2170        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2171                                 filt3);
2172        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2173                                 filt3);
2174        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2175                                 filt3);
2176
2177        for (loop_cnt = height >> 1; loop_cnt--;) {
2178            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2179            XORI_B2_128_SB(src7, src8);
2180            src0_ptr_tmp += 2 * src_stride;
2181
2182            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183            src1_ptr_tmp += (2 * src2_stride);
2184
2185            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186                       dst32_r, dst54_r, dst21_r);
2187            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188                       dst32_l, dst54_l, dst21_l);
2189            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2191
2192            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193                       vec0, vec1, vec2, vec3);
2194            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2195                                     filt2, filt3);
2196
2197            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2198            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2199                                    filt_h0, filt_h1, filt_h2, filt_h3);
2200            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2201                                    filt_h0, filt_h1, filt_h2, filt_h3);
2202
2203            dst0_r >>= 6;
2204            dst0_l >>= 6;
2205
2206            /* row 8 */
2207            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208                       vec0, vec1, vec2, vec3);
2209            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2210                                     filt2, filt3);
2211
2212            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2213            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2214                                    filt_h0, filt_h1, filt_h2, filt_h3);
2215            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2216                                    filt_h0, filt_h1, filt_h2, filt_h3);
2217
2218            dst1_r >>= 6;
2219            dst1_l >>= 6;
2220
2221            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2222            ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2223            ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2224            dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225            dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226            dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227            dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228            SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2229            CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2230            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231            out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232            ST_D2(out, 0, 1, dst_tmp, dst_stride);
2233            dst_tmp += (2 * dst_stride);
2234
2235            dst0 = dst2;
2236            dst1 = dst3;
2237            dst2 = dst4;
2238            dst3 = dst5;
2239            dst4 = dst6;
2240            dst5 = dst7;
2241            dst6 = dst8;
2242        }
2243
2244        src0_ptr += 8;
2245        src1_ptr += 8;
2246        dst += 8;
2247    }
2248}
2249
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    /* width 8: a single pass of the generic 8-column HV bi-weight kernel */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 1);
}
2271
/* 2-D (horizontal + vertical) 8-tap bi-prediction with explicit weighting
 * for a 12-pixel-wide block.  The left 8 columns are processed with the
 * interleaved 8-wide path; the right 4 columns use the paired-row 4-wide
 * path (two rows packed per vector via the second mask set in
 * ff_hevc_mask_arr).  src0_ptr supplies reference-0 pixels that are
 * filtered here in both directions; src1_ptr supplies pre-interpolated
 * 16-bit reference-1 samples.  Each output pixel is
 * clip255((p0 * weight0 + p1 * weight1 + offset) >> (rnd_val + 1)).
 * NOTE(review): the height parameter is unused -- the loops below
 * hard-code 8 x 2 and 4 x 4 rows, i.e. height == 16; confirm callers
 * only dispatch h == 16 here. */
static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    int32_t offset, weight;
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;

    /* back up 3 rows and 3 columns: the 8-tap filter is centered */
    src0_ptr -= ((3 * src_stride) + 3);

    /* Fold both per-reference offsets into one pre-shift constant and pack
     * the two weights into each 32-bit lane for dpadd. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* NOTE(review): 128 * weight1 << 6 presumably cancels the -128 * 64
     * bias the xor-by-128 trick leaves in the ref-1 intermediates --
     * confirm against the matching uni-directional kernels. */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* sign-extend the vertical taps to 16 bit, then splat pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* ---- left 8 columns ---- */
    src0_ptr_tmp = src0_ptr;
    src1_ptr_tmp = src1_ptr;
    dst_tmp = dst;

    /* horizontally filter the 7 prologue rows needed by the vertical tap */
    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                              filt2, filt3);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);

    /* 8 iterations x 2 rows = 16 output rows */
    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
        src0_ptr_tmp += (2 * src_stride);
        XORI_B2_128_SB(src7, src8);

        LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
        src1_ptr_tmp += (2 * src2_stride);

        /* interleave the 7-row column history for the vertical filter */
        ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
        ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);

        /* horizontal filter of the new row 7 */
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);

        /* vertical 8-tap in 32-bit precision, then drop 6 fraction bits */
        ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
        dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                              filt_h1, filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                              filt_h1, filt_h2, filt_h3);
        dst0 >>= 6;
        dst1 >>= 6;

        /* horizontal filter of the new row 8 */
        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);

        ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
        dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                              filt_h1, filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
                              filt_h1, filt_h2, filt_h3);
        dst2 >>= 6;
        dst3 >>= 6;

        /* weighted sum with ref-1, round, clip, pack, store 2 rows of 8 */
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
        CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST_D2(out, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* slide the 7-row column history down by two rows */
        dsth0 = dsth2;
        dsth1 = dsth3;
        dsth2 = dsth4;
        dsth3 = dsth5;
        dsth4 = dsth6;
        dsth5 = dsth7;
        dsth6 = dsth8;
    }

    /* ---- right 4 columns ---- */
    src0_ptr += 8;
    src1_ptr += 8;
    dst += 8;

    /* paired-source masks: shuffle two rows into one vector */
    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* horizontal filter of rows 0..6, two rows per vector */
    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    /* keep row 6 alone (upper half of dst63) for the next interleave */
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    /* 4 iterations x 4 rows = 16 output rows */
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        /* gather four 4-sample ref-1 rows into two vectors */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        /* vertical 8-tap for the four output rows */
        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        /* weighted sum with ref-1, round, clip, store 4 rows of 4 */
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the column history down by four rows */
        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
2521
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* width 16: two 8-column passes of the generic HV bi-weight kernel */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 2);
}
2543
static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* width 24: three 8-column passes of the generic HV bi-weight kernel */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 3);
}
2565
static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* width 32: four 8-column passes of the generic HV bi-weight kernel */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 4);
}
2587
static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* width 48: six 8-column passes of the generic HV bi-weight kernel */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 6);
}
2609
static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* width 64: eight 8-column passes of the generic HV bi-weight kernel */
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 8);
}
2631
/* Horizontal 4-tap bi-prediction with explicit weighting for a 4x2 block.
 * src0_ptr supplies reference-0 pixels filtered here with the 4-tap
 * filter; src1_ptr supplies pre-interpolated 16-bit reference-1 samples.
 * Each output pixel is
 * clip255((filt(p0) * weight0 + p1 * weight1 + offset) >> (rnd_val + 1)). */
static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    /* 4-width shuffle pattern (second half of ff_hevc_mask_arr) */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1, vec0, vec1;
    v8i16 dst0;
    v4i32 dst0_r, dst0_l;
    v8i16 out0, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* the 4-tap filter reads one column left of the sample position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* Fold both per-reference offsets into one pre-shift constant and pack
     * the two weights into each 32-bit lane for dpadd. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): 128 * weight1 << 6 presumably cancels the -128 * 64
     * bias the xor-by-128 trick leaves in the ref-1 intermediates --
     * confirm against the matching uni-directional kernels. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* pack the two 4-sample ref-1 rows into one vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src0, src1);

    /* horizontal 4-tap filter of both rows at once */
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

    /* weighted sum with ref-1, round, clip, store two 4-byte rows */
    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
    out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    CLIP_SH_0_255(out0);
    out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
    ST_W2(out0, 0, 1, dst, dst_stride);
}
2691
/* Horizontal 4-tap bi-prediction with explicit weighting for a 4x4 block;
 * same weighting scheme as hevc_hz_biwgt_4t_4x2_msa, two rows per vector. */
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    /* 4-width shuffle pattern (second half of ff_hevc_mask_arr) */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 dst0, dst1;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* the 4-tap filter reads one column left of the sample position */
    src0_ptr -= 1;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* Fold both per-reference offsets into one pre-shift constant and pack
     * the two weights into each 32-bit lane for dpadd. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): 128 * weight1 << 6 presumably cancels the -128 * 64
     * bias the xor-by-128 trick leaves in the ref-1 intermediates --
     * confirm against the matching uni-directional kernels. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    /* pack four 4-sample ref-1 rows into two vectors */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);

    /* horizontal 4-tap filter, two rows per call */
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    /* weighted sum with ref-1, round, clip */
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    /* pack to bytes and store four 4-byte rows */
    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
2751
/* Horizontal 4-tap bi-prediction with explicit weighting for 4-wide blocks
 * whose height is a multiple of 8; same weighting scheme as
 * hevc_hz_biwgt_4t_4x2_msa, processing 8 rows per loop iteration. */
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t weight, offset, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    /* 4-width shuffle pattern (second half of ff_hevc_mask_arr) */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* the 4-tap filter reads one column left of the sample position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Fold both per-reference offsets into one pre-shift constant and pack
     * the two weights into each 32-bit lane for dpadd.  NOTE(review):
     * 128 * weight1 << 6 presumably cancels the -128 * 64 bias the
     * xor-by-128 trick leaves in the ref-1 intermediates -- confirm
     * against the matching uni-directional kernels. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;

    /* 8 output rows per iteration */
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* pack eight 4-sample ref-1 rows into four vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* horizontal 4-tap filter, two rows per call */
        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weighted sum with ref-1, round, clip */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        /* pack to bytes and store eight 4-byte rows */
        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
2826
2827static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
2828                                    int32_t src_stride,
2829                                    int16_t *src1_ptr,
2830                                    int32_t src2_stride,
2831                                    uint8_t *dst,
2832                                    int32_t dst_stride,
2833                                    const int8_t *filter,
2834                                    int32_t height,
2835                                    int32_t weight0,
2836                                    int32_t weight1,
2837                                    int32_t offset0,
2838                                    int32_t offset1,
2839                                    int32_t rnd_val)
2840{
2841    if (2 == height) {
2842        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2843                                 dst, dst_stride, filter,
2844                                 weight0, weight1, offset0, offset1, rnd_val);
2845    } else if (4 == height) {
2846        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2847                                 dst, dst_stride, filter,
2848                                 weight0, weight1, offset0, offset1, rnd_val);
2849    } else if (0 == (height % 8)) {
2850        hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2851                                         src1_ptr, src2_stride,
2852                                         dst, dst_stride, filter, height,
2853                                         weight0, weight1, offset0, offset1,
2854                                         rnd_val);
2855    }
2856}
2857
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 6-pixel width, MIPS MSA version.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets for the
 *                        two references
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 *
 * NOTE(review): the height argument is ignored; exactly 8 rows
 * (2 iterations x 4 rows) are processed.
 */
static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* the 4 int8 taps are reinterpreted as two halfwords; filt0/filt1 each
     * hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack both weights into one 32-bit word so a single dpadd in
     * HEVC_BIW_RND_CLIP applies both weights at once */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): 128 * weight1 * 64 appears to compensate the -128
     * recentring (XORI 128) applied to the 8-bit source — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    /* mask1 gathers the 3rd/4th tap neighbours (mask0 shifted by 2) */
    mask1 = mask0 + 2;

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        /* recentre unsigned bytes to signed range for the signed dot product */
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

        /* weight, add offset, round/shift, clip to [0, 255] */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        /* store 6 bytes per row: one word (4) plus one halfword (2) */
        ST_W2(dst0, 0, 2, dst, dst_stride);
        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}
2931
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 8-pixel width, exactly 2 rows, MIPS MSA version.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 */
static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* recentre unsigned bytes to signed range */
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    /* weight, add offset, round/shift, clip to [0, 255] */
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);
}
2987
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 8-pixel width, exactly 6 rows, MIPS MSA version.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 */
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t weight, offset, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;

    /* all 6 rows are loaded and filtered in one shot */
    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    /* recentre unsigned bytes to signed range */
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    /* weight, add offset, round/shift, clip to [0, 255]: rows 0-3 then 4-5 */
    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                       in0, in1, in2, in3,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1, dst2, dst3);
    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                       weight_vec, rnd_vec, offset_vec,
                       dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
}
3061
3062static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3063                                             int32_t src_stride,
3064                                             int16_t *src1_ptr,
3065                                             int32_t src2_stride,
3066                                             uint8_t *dst,
3067                                             int32_t dst_stride,
3068                                             const int8_t *filter,
3069                                             int32_t height,
3070                                             int32_t weight0,
3071                                             int32_t weight1,
3072                                             int32_t offset0,
3073                                             int32_t offset1,
3074                                             int32_t rnd_val)
3075{
3076    uint32_t loop_cnt;
3077    int32_t offset, weight, constant;
3078    v8i16 filt0, filt1;
3079    v16i8 src0, src1, src2, src3;
3080    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3081    v16i8 mask1;
3082    v16i8 vec0, vec1;
3083    v8i16 in0, in1, in2, in3;
3084    v8i16 dst0, dst1, dst2, dst3;
3085    v8i16 filter_vec;
3086    v4i32 weight_vec, offset_vec, rnd_vec;
3087
3088    src0_ptr -= 1;
3089
3090    filter_vec = LD_SH(filter);
3091    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3092
3093    offset = (offset0 + offset1) << rnd_val;
3094    weight0 = weight0 & 0x0000FFFF;
3095    weight = weight0 | (weight1 << 16);
3096    constant = 128 * weight1;
3097    constant <<= 6;
3098    offset += constant;
3099
3100    offset_vec = __msa_fill_w(offset);
3101    weight_vec = __msa_fill_w(weight);
3102    rnd_vec = __msa_fill_w(rnd_val + 1);
3103
3104    mask1 = mask0 + 2;
3105
3106    for (loop_cnt = (height >> 2); loop_cnt--;) {
3107        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3108        src0_ptr += (4 * src_stride);
3109        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3110        src1_ptr += (4 * src2_stride);
3111        XORI_B4_128_SB(src0, src1, src2, src3);
3112
3113        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3114        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3115        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3116        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3117        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3118        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3119        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3120        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3121        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3122                           in0, in1, in2, in3,
3123                           weight_vec, rnd_vec, offset_vec,
3124                           dst0, dst1, dst2, dst3);
3125
3126        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3127        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3128        dst += (4 * dst_stride);
3129    }
3130}
3131
3132static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
3133                                    int32_t src_stride,
3134                                    int16_t *src1_ptr,
3135                                    int32_t src2_stride,
3136                                    uint8_t *dst,
3137                                    int32_t dst_stride,
3138                                    const int8_t *filter,
3139                                    int32_t height,
3140                                    int32_t weight0,
3141                                    int32_t weight1,
3142                                    int32_t offset0,
3143                                    int32_t offset1,
3144                                    int32_t rnd_val)
3145{
3146    if (2 == height) {
3147        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3148                                 dst, dst_stride, filter,
3149                                 weight0, weight1, offset0, offset1, rnd_val);
3150    } else if (6 == height) {
3151        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3152                                 dst, dst_stride, filter,
3153                                 weight0, weight1, offset0, offset1, rnd_val);
3154    } else if (0 == (height % 4)) {
3155        hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3156                                         src1_ptr, src2_stride,
3157                                         dst, dst_stride, filter, height,
3158                                         weight0, weight1, offset0, offset1,
3159                                         rnd_val);
3160    }
3161}
3162
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 12-pixel width, MIPS MSA version. Each row is handled as an 8-wide part
 * (mask0/mask1) plus a 4-wide part; the 4-wide parts of two rows are
 * filtered together via the cross-register mask2/mask3 shuffles.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 *
 * NOTE(review): the height argument is ignored; exactly 16 rows
 * (4 iterations x 4 rows) are processed.
 */
static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* selects bytes 8.. from one register and 8.. (24..) from the next:
     * pairs the rightmost 4 output pixels of two consecutive rows */
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* pair the 4-wide tails of consecutive rows into single vectors */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        /* recentre unsigned bytes to signed range */
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* 8-wide left part of each of the 4 rows */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* 4-wide right part, two rows per vector */
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

        /* weight, add offset, round/shift, clip to [0, 255] */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        /* 8 left pixels per row, then the 4-pixel tails at column 8 */
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
3248
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 16-pixel width, MIPS MSA version. Each row is processed as two 8-wide
 * halves (even-numbered src/in registers hold the left half, odd ones the
 * right half loaded at byte/sample offset 8).
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * height               : number of rows, must be a multiple of 4
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 */
static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;

    /* 4 rows (8 half-row vectors) per iteration */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        /* recentre unsigned bytes to signed range */
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round/shift, clip to [0, 255]: first 2 rows */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);

        /* second pair of rows */
        HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
                           in4, in5, in6, in7,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
3337
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 24-pixel width, MIPS MSA version. Two rows per iteration: the left 16
 * pixels use mask0/mask1 plus the cross-register mask2/mask3 (which span
 * the 16-byte register boundary), the remaining 8 pixels are filtered
 * from the second 16-byte load of each row.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 *
 * NOTE(review): the height argument is ignored; exactly 32 rows
 * (16 iterations x 2 rows) are processed.
 */
static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;
    /* mask2/mask3 gather pixels 8..15, reaching into the second register */
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    for (loop_cnt = 16; loop_cnt--;) {
        /* src0/src2: bytes 0..15 of rows 0/1; src1/src3: bytes 16..31 */
        LD_SB2(src0_ptr, src_stride, src0, src2);
        LD_SB2(src0_ptr + 16, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        /* recentre unsigned bytes to signed range */
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* left 16 pixels of both rows */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round/shift, clip to [0, 255] */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);

        /* 8 width */
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
        dst += (2 * dst_stride);
    }
}
3424
/* HEVC horizontal 4-tap filter with bi-directional weighted prediction,
 * 32-pixel width, MIPS MSA version. One row per iteration; pixels 8..15
 * need the cross-register mask2/mask3, and src2 is loaded at byte offset
 * 24 (overlapping src1) so pixels 24..31 can use the plain masks.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * height               : number of rows (one iteration each)
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 */
static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one column left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;
    /* mask2/mask3 gather pixels 8..15, reaching into the second register */
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        /* overlapping load so the last 8 outputs use the plain masks */
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        /* recentre unsigned bytes to signed range */
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round/shift, clip to [0, 255] */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
        dst += dst_stride;
    }
}
3497
/* HEVC vertical 4-tap filter with bi-directional weighted prediction,
 * 4-pixel width, exactly 2 rows, MIPS MSA version. The two output rows
 * are packed into one vector and filtered in a single pass.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered here
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other reference
 * dst/dst_stride       : 8-bit weighted-average output
 * filter               : the 4 int8 filter taps
 * weight0/offset0, weight1/offset1 : bi-prediction weights/offsets
 * rnd_val              : final result is rounded with shift (rnd_val + 1)
 */
static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t weight, offset, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst10;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v4i32 dst10_r, dst10_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, out;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap filter reads one row above the output position */
    src0_ptr -= src_stride;

    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for a single dpadd */
    weight = weight0 | (weight1 << 16);
    /* NOTE(review): compensation for the XORI-128 source recentring — confirm */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    /* filt0/filt1 each hold one splatted adjacent tap pair */
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 3 context rows above, then the 2 rows that complete the 4-tap window */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    /* rows (0,1) and (1,2) interleaved into one vector */
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    /* recentre unsigned bytes to signed range */
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    src1_ptr += (2 * src2_stride);

    /* both intermediate rows packed into one 8-halfword vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

    /* inline bi-weighting (the HEVC_BIW_RND_CLIP macros handle >= 2
     * vectors; here a single vector holds both rows) */
    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
    out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
    CLIP_SH_0_255(out);
    out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
    /* one word per output row */
    ST_W2(out, 0, 1, dst, dst_stride);
}
3562
3563static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3564                                     int32_t src_stride,
3565                                     int16_t *src1_ptr,
3566                                     int32_t src2_stride,
3567                                     uint8_t *dst,
3568                                     int32_t dst_stride,
3569                                     const int8_t *filter,
3570                                     int32_t weight0,
3571                                     int32_t weight1,
3572                                     int32_t offset0,
3573                                     int32_t offset1,
3574                                     int32_t rnd_val)
3575{
3576    int32_t weight, offset, constant;
3577    v16i8 src0, src1, src2, src3, src4, src5, src6;
3578    v8i16 in0, in1, in2, in3;
3579    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3580    v16i8 src2110, src4332, src6554;
3581    v8i16 dst10, dst32;
3582    v8i16 filt0, filt1;
3583    v8i16 filter_vec;
3584    v4i32 weight_vec, offset_vec, rnd_vec;
3585
3586    src0_ptr -= src_stride;
3587
3588    offset = (offset0 + offset1) << rnd_val;
3589    weight0 = weight0 & 0x0000FFFF;
3590    weight = weight0 | (weight1 << 16);
3591    constant = 128 * weight1;
3592    constant <<= 6;
3593    offset += constant;
3594
3595    offset_vec = __msa_fill_w(offset);
3596    weight_vec = __msa_fill_w(weight);
3597    rnd_vec = __msa_fill_w(rnd_val + 1);
3598
3599    filter_vec = LD_SH(filter);
3600    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3601
3602    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3603    src0_ptr += (3 * src_stride);
3604    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3605    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3606    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3607
3608    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3609    src0_ptr += (4 * src_stride);
3610    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3611    src1_ptr += (4 * src2_stride);
3612    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3613    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3614               src32_r, src43_r, src54_r, src65_r);
3615    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3616    XORI_B2_128_SB(src4332, src6554);
3617
3618    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3619    dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3620
3621    HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3622                       weight_vec, rnd_vec, offset_vec,
3623                       dst10, dst32);
3624
3625    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3626    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3627    dst += (4 * dst_stride);
3628}
3629
/* Vertical 4-tap bi-weighted HEVC interpolation for 4-wide blocks whose
 * height is a multiple of 8 (MIPS MSA).  Eight rows are produced per loop
 * iteration, two 4-pixel rows packed per vector.
 *
 * src0_ptr: 8-bit reference pixels; src1_ptr: 16-bit intermediate
 * prediction from the other reference; dst: 8-bit output.
 * rnd_val + 1 is the bi-prediction rounding shift.
 */
static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t weight, offset, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= src_stride;

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: first 3 rows -> interleaved row pairs for the filter. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* Pack two 4-wide intermediate rows per vector. */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);   /* to signed range */

        dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
        dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
        dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);

        /* Last two rows of this iteration; src2110 is rebuilt here and
         * carried into the next loop iteration. */
        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

        dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
        /* Weighted blend with the intermediate prediction + round/clip. */
        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
        ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
3715
/* Vertical 4-tap bi-weighted interpolation, 4-wide blocks: route to the
 * height-specialised kernel (2 rows, 4 rows, or any multiple of 8). */
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter, weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter, weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    default:
        /* Remaining supported heights are multiples of 8 rows. */
        if (0 == (height % 8)) {
            hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1,
                                             offset0, offset1, rnd_val);
        }
        break;
    }
}
3746
/* Vertical 4-tap bi-weighted HEVC interpolation, 6-wide block (MIPS MSA).
 * Four rows per loop iteration; each 6-pixel row is stored as a 4-byte
 * word plus a 2-byte halfword.
 * src0_ptr: 8-bit reference pixels; src1_ptr: 16-bit intermediate
 * prediction; dst: 8-bit output; shift is rnd_val + 1.
 */
static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= src_stride;

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: 3 rows, converted to signed and interleaved pairwise. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

        /* Next two rows; src2 and the interleaves carry into the next
         * loop iteration. */
        LD_SB2(src0_ptr, src_stride, src1, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src1, src2);
        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);

        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        /* Weighted blend with the intermediate prediction + round/clip. */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        /* 6 pixels per row = 4-byte word + 2-byte halfword per row. */
        ST_W2(tmp0, 0, 2, dst, dst_stride);
        ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
        ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}
3823
/* Vertical 4-tap bi-weighted HEVC interpolation for an 8x2 block (MIPS MSA).
 * src0_ptr: 8-bit reference pixels; src1_ptr: 16-bit intermediate
 * prediction from the other reference; dst: 8-bit output; the rounding
 * shift is rnd_val + 1.
 */
static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, tmp0, tmp1;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= src_stride;

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: 3 rows, converted to signed and interleaved pairwise. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    /* Weighted blend with the intermediate prediction + round/clip. */
    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       tmp0, tmp1);

    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(tmp0, 0, 1, dst, dst_stride);   /* two 8-byte rows */
}
3880
/* Vertical 4-tap bi-weighted HEVC interpolation for an 8x6 block (MIPS MSA).
 * All six output rows are computed in one pass (no loop).
 * src0_ptr: 8-bit reference pixels; src1_ptr: 16-bit intermediate
 * prediction from the other reference; dst: 8-bit output; the rounding
 * shift is rnd_val + 1.
 */
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= src_stride;

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: 3 rows, converted to signed and interleaved pairwise. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* Remaining 6 pixel rows and all 6 intermediate rows at once. */
    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* One 4-tap vertical filter result per output row. */
    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
    tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
    /* Weighted blend with the intermediate prediction + round/clip. */
    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                       in0, in1, in2, in3,
                       weight_vec, rnd_vec, offset_vec,
                       tmp0, tmp1, tmp2, tmp3);
    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                       weight_vec, rnd_vec, offset_vec,
                       tmp4, tmp5);

    PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);          /* rows 0-3 */
    ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);     /* rows 4-5 */
}
3951
/* Vertical 4-tap bi-weighted HEVC interpolation for 8-wide blocks whose
 * height is a multiple of 4 (MIPS MSA).  Four rows per loop iteration.
 * src0_ptr: 8-bit reference pixels; src1_ptr: 16-bit intermediate
 * prediction from the other reference; dst: 8-bit output; the rounding
 * shift is rnd_val + 1.
 */
static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= src_stride;

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: 3 rows, converted to signed and interleaved pairwise. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

        /* Next two rows; src2 and the interleaves carry into the next
         * loop iteration. */
        LD_SB2(src0_ptr, src_stride, src1, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src1, src2);
        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);

        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        /* Weighted blend with the intermediate prediction + round/clip. */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);   /* 4 x 8 bytes */
        dst += (4 * dst_stride);
    }
}
4025
/* Vertical 4-tap bi-weighted interpolation, 8-wide blocks: route to the
 * height-specialised kernel (2, 6, or the multiple-of-4 loop). */
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter, weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    case 6:
        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter, weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    default:
        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1,
                                         offset0, offset1, rnd_val);
        break;
    }
}
4056
/* Vertical 4-tap bi-weighted HEVC interpolation, 12-wide block (MIPS MSA).
 * Columns 0-7 are handled through right-half byte interleaves; columns
 * 8-11 come from the left-half interleaves packed two rows per vector.
 * Four rows are produced per loop iteration.
 */
static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= (1 * src_stride);

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: 3 rows; right interleaves for columns 0-7, left
     * interleaves (packed two rows per vector) for columns 8-11. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* in0..in3: columns 0-7; in4/in5: columns 8-11 (two rows each). */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

        /* Next two rows; src2/src2110 carry into the next iteration. */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
        /* Weighted blend with the intermediate prediction + round/clip. */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp4, tmp5);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);      /* cols 0-7  */
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);        /* cols 8-11 */
        dst += (4 * dst_stride);
    }
}
4148
/* Vertical 4-tap bi-weighted HEVC interpolation, 16-wide block (MIPS MSA).
 * Right/left byte interleaves together cover all 16 columns; four rows are
 * produced per loop iteration in two 2-row stages.
 */
static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* The 4-tap filter needs one row above the first output row. */
    src0_ptr -= src_stride;

    /* Combined offset, pre-scaled to survive the final >> (rnd_val + 1). */
    offset = (offset0 + offset1) << rnd_val;
    /* Pack both weights into one 32-bit word: one per 16-bit dpadd lane. */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* Correction for the xori-128 signedness bias, folded into offset. */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Prologue: 3 rows, converted to signed; right interleaves for columns
     * 0-7, left interleaves for columns 8-15. */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* Stage 1: rows 0-1 of this iteration. */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);        /* cols 0-7  */
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);    /* cols 8-15 */
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* Weighted blend with the intermediate prediction + round/clip. */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);   /* two 16-byte rows */
        dst += (2 * dst_stride);

        /* Stage 2: rows 2-3; src2 and the interleaves carry over. */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);

        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
4242
/* Vertical 4-tap filter with bi-directional weighted prediction,
 * 24 columns wide.
 *
 * src0_ptr/src_stride  : 8-bit source pixels
 * src1_ptr/src2_stride : 16-bit samples combined in before weighting
 *                        (presumably the other prediction's intermediate
 *                        buffer — verify against caller)
 * dst/dst_stride       : 8-bit weighted, rounded and clipped output
 * filter               : 4-tap coefficients, splatted into filt0/filt1
 * height               : row count; four rows consumed per loop iteration
 *
 * The 24 columns are filtered as a 16-wide strip (right and left byte
 * interleaves) plus an 8-wide strip (right interleave only).
 */
static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* back up one row: the 4-tap vertical window starts one row above */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding amount */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) / weight1 (high halfword) so a single
       dpadd_s_w applies both weights to an interleaved sample pair */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* fold into the offset the -128 bias that XORI_*_128 puts on the
       8-bit source path: bias * filter gain (64) * weight1
       — NOTE(review): matches the usual MSA biweight setup; confirm
       against the scalar reference */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    /* bi-pred rounds with one extra bit of shift */
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: three history rows for the 4-tap vertical filter */
    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* 8width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* four output rows per iteration, produced in two 2-row halves */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        /* 8width */
        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        /* weight, round and clip both strips */
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 8width */
        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        /* 8width */
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* second 2-row half: operand roles of the interleave pairs swap */
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
        /* 8width */
        tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 8width */
        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);

        /* 8width */
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
4379
/* Vertical 4-tap filter with bi-directional weighted prediction,
 * 32 columns wide, processed as two independent 16-wide strips.
 *
 * src0_ptr/src_stride  : 8-bit source pixels
 * src1_ptr/src2_stride : 16-bit samples combined in before weighting
 *                        (presumably the other prediction's intermediate
 *                        buffer — verify against caller)
 * dst/dst_stride       : 8-bit weighted, rounded and clipped output
 * filter               : 4-tap coefficients, splatted into filt0/filt1
 * height               : row count; two rows produced per loop iteration
 */
static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;    /* output pointer for the second strip */
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* back up one row: the 4-tap vertical window starts one row above */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding amount */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) / weight1 (high halfword) so a single
       dpadd_s_w applies both weights to an interleaved sample pair */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* fold into the offset the -128 bias that XORI_*_128 puts on the
       8-bit source path: bias * filter gain (64) * weight1
       — NOTE(review): confirm against the scalar reference */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    /* bi-pred rounds with one extra bit of shift */
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: three history rows per strip for the 4-tap filter */
    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* next 16width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);

        /* slide the first strip's 2-row filter history forward */
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        /* next 16width */
        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
        /* next 16width */
        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
                           in4, in5, in6, in7,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3, tmp6, tmp7);

        /* next 16width */
        PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* slide the second strip's 2-row filter history forward */
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
4498
/* Horizontal+vertical (2-D) 4-tap filter with bi-directional weighted
 * prediction, 4 columns x 2 rows.
 *
 * src0_ptr/src_stride  : 8-bit source pixels
 * src1_ptr/src2_stride : 16-bit samples combined in before weighting
 *                        (presumably the other prediction's intermediate
 *                        buffer — verify against caller)
 * dst/dst_stride       : 8-bit weighted, rounded and clipped output
 * filter_x / filter_y  : horizontal / vertical 4-tap coefficients
 */
static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v8i16 in0 = { 0 };
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* second row of ff_hevc_mask_arr: 4-width shuffle pattern drawing
       from two source vectors (indices >= 16 select the second vector) */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
    v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;

    /* back up one row and one column: the 2-D 4-tap window starts there */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding amount */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) / weight1 (high halfword) so a single
       dpadd_s_w applies both weights to an interleaved sample pair */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* fold into the offset the -128 bias that XORI_*_128 puts on the
       8-bit source path: bias * filter gain (64) * weight1
       — NOTE(review): confirm against the scalar reference */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    /* bi-pred rounds with one extra bit of shift */
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* five source rows yield two output rows through the vertical 4-tap */
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* horizontal pass: each shuffle packs two rows into one vector */
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    /* vertical pass over the interleaved horizontal results */
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    /* drop the horizontal stage's 6-bit filter gain */
    dst0 >>= 6;
    dst1 >>= 6;
    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);

    /* load the two 4-sample rows of 16-bit co-input */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);

    /* weight, round, clip to [0,255] and store 2 rows of 4 bytes */
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    SRAR_W2_SW(dst0, dst1, rnd_vec);
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    CLIP_SH_0_255(tmp);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}
4582
/* Horizontal+vertical (2-D) 4-tap filter with bi-directional weighted
 * prediction, 4 columns x 4 rows.
 *
 * src0_ptr/src_stride  : 8-bit source pixels
 * src1_ptr/src2_stride : 16-bit samples combined in before weighting
 *                        (presumably the other prediction's intermediate
 *                        buffer — verify against caller)
 * dst/dst_stride       : 8-bit weighted, rounded and clipped output
 * filter_x / filter_y  : horizontal / vertical 4-tap coefficients
 */
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* second row of ff_hevc_mask_arr: 4-width shuffle pattern drawing
       from two source vectors (indices >= 16 select the second vector) */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 offset_vec, rnd_vec, const_vec;
    v4i32 dst0, dst1, dst2, dst3;

    /* back up one row and one column: the 2-D 4-tap window starts there */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding amount */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) / weight1 (high halfword) so a single
       dpadd_s_w applies both weights to an interleaved sample pair */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* fold into the offset the -128 bias that XORI_*_128 puts on the
       8-bit source path: bias * filter gain (64) * weight1
       — NOTE(review): confirm against the scalar reference */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    /* bi-pred rounds with one extra bit of shift */
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* seven source rows yield four output rows through the vertical 4-tap */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* horizontal pass: each shuffle packs two rows into one vector */
    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    /* vertical pass over the interleaved horizontal results */
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    /* drop the horizontal stage's 6-bit filter gain */
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);

    /* load the four 4-sample rows of 16-bit co-input */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    src1_ptr += (2 * src2_stride);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    /* weight, round, clip to [0,255] and store 4 rows of 4 bytes */
    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);

    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
4679
/* Horizontal+vertical (2-D) 4-tap filter with bi-directional weighted
 * prediction, 4 columns wide, any height that is a multiple of 8.
 *
 * src0_ptr/src_stride  : 8-bit source pixels
 * src1_ptr/src2_stride : 16-bit samples combined in before weighting
 *                        (presumably the other prediction's intermediate
 *                        buffer — verify against caller)
 * dst/dst_stride       : 8-bit weighted, rounded and clipped output
 * filter_x / filter_y  : horizontal / vertical 4-tap coefficients
 * height               : rows; eight are produced per loop iteration
 */
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out0, out1;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    /* second row of ff_hevc_mask_arr: 4-width shuffle pattern drawing
       from two source vectors (indices >= 16 select the second vector) */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* back up one row and one column: the 2-D 4-tap window starts there */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding amount */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) / weight1 (high halfword) so a single
       dpadd_s_w applies both weights to an interleaved sample pair */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* fold into the offset the -128 bias that XORI_*_128 puts on the
       8-bit source path: bias * filter gain (64) * weight1
       — NOTE(review): confirm against the scalar reference */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    /* bi-pred rounds with one extra bit of shift */
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* prologue: three rows of horizontal-filter history */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* dst22 = row 2's horizontal result alone in the low doubleword */
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* horizontal pass: each shuffle packs two rows into one vector */
        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* build the vertical-tap pairs for rows 0..7 */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* load eight 4-sample rows of 16-bit co-input */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        /* vertical pass */
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        /* drop the horizontal stage's 6-bit filter gain */
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
                    dst2, dst3);
        /* weight, round, clip to [0,255] and store 8 rows of 4 bytes */
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* carry the last rows' results over as next iteration's history */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
4823
4824static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4825                                    int32_t src_stride,
4826                                    int16_t *src1_ptr,
4827                                    int32_t src2_stride,
4828                                    uint8_t *dst,
4829                                    int32_t dst_stride,
4830                                    const int8_t *filter_x,
4831                                    const int8_t *filter_y,
4832                                    int32_t height,
4833                                    int32_t weight0,
4834                                    int32_t weight1,
4835                                    int32_t offset0,
4836                                    int32_t offset1,
4837                                    int32_t rnd_val)
4838{
4839    if (2 == height) {
4840        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4841                                 dst, dst_stride, filter_x, filter_y,
4842                                 weight0, weight1, offset0, offset1, rnd_val);
4843    } else if (4 == height) {
4844        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4845                                 dst, dst_stride, filter_x, filter_y,
4846                                 weight0, weight1, offset0, offset1, rnd_val);
4847    } else if (0 == (height % 8)) {
4848        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4849                                         src1_ptr, src2_stride,
4850                                         dst, dst_stride, filter_x, filter_y,
4851                                         height, weight0, weight1,
4852                                         offset0, offset1, rnd_val);
4853    }
4854}
4855
4856static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4857                                    int32_t src_stride,
4858                                    int16_t *src1_ptr,
4859                                    int32_t src2_stride,
4860                                    uint8_t *dst,
4861                                    int32_t dst_stride,
4862                                    const int8_t *filter_x,
4863                                    const int8_t *filter_y,
4864                                    int32_t height,
4865                                    int32_t weight0,
4866                                    int32_t weight1,
4867                                    int32_t offset0,
4868                                    int32_t offset1,
4869                                    int32_t rnd_val)
4870{
4871    uint32_t tpw0, tpw1, tpw2, tpw3;
4872    uint64_t tp0, tp1;
4873    int32_t offset, weight;
4874    v16u8 out0, out1, out2;
4875    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4876    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4877    v8i16 in4 = { 0 }, in5 = { 0 };
4878    v8i16 filt0, filt1;
4879    v8i16 filt_h0, filt_h1, filter_vec;
4880    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4881    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4882    v16i8 mask1;
4883    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4884    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4885    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4886    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4887    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4888    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4889    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4890    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4891    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4892    v4i32 offset_vec, rnd_vec, const_vec;
4893
4894    src0_ptr -= (src_stride + 1);
4895
4896    filter_vec = LD_SH(filter_x);
4897    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4898
4899    filter_vec = LD_SH(filter_y);
4900    UNPCK_R_SB_SH(filter_vec, filter_vec);
4901
4902    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4903
4904    mask1 = mask0 + 2;
4905
4906    offset = (offset0 + offset1) << rnd_val;
4907    weight0 = weight0 & 0x0000FFFF;
4908    weight = weight0 | (weight1 << 16);
4909
4910    const_vec = __msa_fill_w((128 * weight1));
4911    const_vec <<= 6;
4912    offset_vec = __msa_fill_w(offset);
4913    weight_vec = (v8i16) __msa_fill_w(weight);
4914    rnd_vec = __msa_fill_w(rnd_val + 1);
4915    offset_vec += const_vec;
4916
4917    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4918    src0_ptr += (3 * src_stride);
4919    XORI_B3_128_SB(src0, src1, src2);
4920
4921    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4922    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4923    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4924    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4925    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4926    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4927
4928    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4929    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4930
4931    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4932           src10);
4933    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4934
4935    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4936    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4937    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4938    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4939
4940    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4941    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4942    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4943    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4944
4945    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4946    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4947    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4948    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4949
4950    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4951    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4952    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4953    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4954
4955    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4956    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4957    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4958    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4959    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4960    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4961    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4962    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4963    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4964    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4965    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4966
4967    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4968    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4969    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4970    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4971    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4972    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4973    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4974    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4975    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4976    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4977    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4978    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4979    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4980    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4981    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4982    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4983    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4984
4985    LD2(src1_ptr, src2_stride, tp0, tp1);
4986    INSERT_D2_SH(tp0, tp1, in0);
4987    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4988    INSERT_D2_SH(tp0, tp1, in1);
4989
4990    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4991    INSERT_D2_SH(tp0, tp1, in2);
4992    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4993    INSERT_D2_SH(tp0, tp1, in3);
4994
4995    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4996    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4997    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4998    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4999    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5000    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5001    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5002    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5003    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5004    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5005    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5006    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5007    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5008    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5009    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5010                tmp2, tmp3);
5011    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5012    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5013    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5014
5015    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5016
5017    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5018    src1_ptr += (4 * src2_stride);
5019    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5020    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5021    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5022
5023    ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5024    ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5025
5026    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5027    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5028    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5029    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5030    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5031    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5032
5033    CLIP_SH2_0_255(tmp4, tmp5);
5034    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5035    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5036}
5037
/* HEVC 4-tap horizontal+vertical (HV) bi-weighted prediction for one
 * 8x2 block.
 *
 * src0_ptr/src_stride  : uint8 reference pels; filtered here with the
 *                        4-tap filter_x (horizontal) then filter_y (vertical)
 * src1_ptr/src2_stride : int16 intermediate samples of the other reference
 * dst/dst_stride       : uint8 output
 * weight0, offset0     : weight/offset applied to the src1_ptr term
 * weight1, offset1     : weight/offset applied to the locally filtered term
 * rnd_val              : final rounding shift (applied as rnd_val + 1)
 */
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t weight, offset;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v8i16 in0, in1;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* point at the first pel needed by the 4-tap filters
       (one row up, one column left) */
    src0_ptr -= (src_stride + 1);

    /* horizontal filter taps */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* vertical filter taps, sign-extended to 16 bit */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) with weight1 (high halfword) for the
       per-pixel halfword dot product below */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 bias added to src0 pels by XORI_B below:
       128 * 64 * weight1 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* 2 output rows need 5 input rows for the vertical 4-tap filter */
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* int16 samples of the second reference */
    LD_SH2(src1_ptr, src2_stride, in0, in1);

    /* horizontal 4-tap pass on each of the 5 rows */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);

    /* vertical 4-tap pass over interleaved row pairs */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* >> 6 drops the headroom left by the horizontal pass */
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);

    /* pair each filtered sample with the matching src1_ptr sample */
    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);

    /* weighted sum via halfword dot product, plus the biased offset,
       then round by (rnd_val + 1) */
    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    /* clip to [0, 255], pack to bytes, store two 8-pel rows */
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}
5133
/* HEVC 4-tap HV bi-weighted prediction for a (8 * width8mult) x 4 block.
 * Each loop iteration handles one 8-column stripe of 4 rows; see
 * hevc_hv_biwgt_4t_8x2_msa for the meaning of the shared arguments.
 */
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
                                         int32_t src_stride,
                                         int16_t *src1_ptr,
                                         int32_t src2_stride,
                                         uint8_t *dst,
                                         int32_t dst_stride,
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,
                                         int32_t weight0,
                                         int32_t weight1,
                                         int32_t offset0,
                                         int32_t offset1,
                                         int32_t rnd_val,
                                         int32_t width8mult)
{
    int32_t weight, offset;
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* point at the first pel needed by the 4-tap filters */
    src0_ptr -= (src_stride + 1);

    /* horizontal filter taps */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* vertical filter taps, sign-extended to 16 bit */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding shift; weights packed
       weight0 (low) | weight1 (high) for the halfword dot product */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 bias added by XORI_B: 128 * 64 * weight1 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    /* one 8-column stripe per iteration */
    for (cnt = width8mult; cnt--;) {
        /* 4 output rows need 7 input rows for the vertical 4-tap */
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src0_ptr += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += 8;

        /* horizontal 4-tap pass, rows 0-2 (vertical context) */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        /* horizontal 4-tap pass, rows 3-6 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        /* vertical 4-tap pass */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        /* >> 6 drops the horizontal-pass headroom */
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);

        /* bi-weight: pair filtered samples with src1_ptr samples, dot
           product with the packed weights, add offset, round, clip */
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        /* store 4 rows of 8 pels, then advance to the next stripe */
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
5257
/* HEVC 4-tap HV bi-weighted prediction for one 8x6 block.
 * Rows 0-3 and rows 4-5 are weighted/stored in two batches; see
 * hevc_hv_biwgt_4t_8x2_msa for the meaning of the shared arguments.
 */
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* point at the first pel needed by the 4-tap filters */
    src0_ptr -= (src_stride + 1);

    /* horizontal filter taps */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* vertical filter taps, sign-extended to 16 bit */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding shift; weights packed
       weight0 (low) | weight1 (high) for the halfword dot product */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 bias added by XORI_B: 128 * 64 * weight1 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* 6 output rows need 9 input rows for the vertical 4-tap */
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    /* int16 samples of the second reference */
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);

    /* horizontal 4-tap pass on each of the 9 rows */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    /* interleave consecutive filtered rows for the vertical pass */
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);

    /* vertical 4-tap pass, 6 output rows */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    /* >> 6 drops the horizontal-pass headroom */
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                dst0, dst1, dst2, dst3);

    /* bi-weight rows 0-3: pair with src1_ptr samples, dot product with
       the packed weights, add offset, round, clip */
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                tmp0, tmp1, tmp2, tmp3);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);

    /* bi-weight rows 4-5 the same way */
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
    ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
    CLIP_SH2_0_255(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    /* store rows 0-3 then rows 4-5 */
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
5407
/* HEVC 4-tap HV bi-weighted prediction for blocks that are a multiple of
 * 8 pels wide and a multiple of 4 rows high.  The outer loop walks
 * 8-column stripes; the inner loop produces 4 rows per iteration and
 * carries the last 3 horizontally-filtered rows over as vertical-filter
 * context.  See hevc_hv_biwgt_4t_8x2_msa for the shared arguments.
 */
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val,
                                             int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    int32_t offset, weight;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* point at the first pel needed by the 4-tap filters */
    src0_ptr -= (src_stride + 1);

    /* horizontal filter taps */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* vertical filter taps, sign-extended to 16 bit */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined offset, pre-scaled by the rounding shift; weights packed
       weight0 (low) | weight1 (high) for the halfword dot product */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 bias added by XORI_B: 128 * 64 * weight1 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* one 8-column stripe per iteration */
    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        /* prime the vertical-filter context with 3 filtered rows */
        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        /* 4 output rows per iteration */
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            /* horizontal 4-tap pass on the 4 new rows */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
            ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
            ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
            ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

            /* vertical 4-tap pass */
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            /* >> 6 drops the horizontal-pass headroom */
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, dst0, dst1, dst2, dst3);
            /* bi-weight: pair with src1_ptr samples, dot product with
               the packed weights, add offset, round, clip */
            ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
            ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
            ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
            ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
            dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
            dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
            dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
            dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
            dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
            dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
            dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
            dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
            SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
            SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        tmp0, tmp1, tmp2, tmp3);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* recycle the last 3 filtered rows as context for the next
               group of 4 output rows */
            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dsth2 = dsth6;
        }

        /* advance all three streams to the next 8-column stripe */
        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
5557
5558static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
5559                                    int32_t src_stride,
5560                                    int16_t *src1_ptr,
5561                                    int32_t src2_stride,
5562                                    uint8_t *dst,
5563                                    int32_t dst_stride,
5564                                    const int8_t *filter_x,
5565                                    const int8_t *filter_y,
5566                                    int32_t height,
5567                                    int32_t weight0,
5568                                    int32_t weight1,
5569                                    int32_t offset0,
5570                                    int32_t offset1,
5571                                    int32_t rnd_val)
5572{
5573    if (2 == height) {
5574        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5575                                 dst, dst_stride, filter_x, filter_y,
5576                                 weight0, weight1, offset0, offset1, rnd_val);
5577    } else if (4 == height) {
5578        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5579                                     src2_stride, dst, dst_stride, filter_x,
5580                                     filter_y, weight0, weight1, offset0,
5581                                     offset1, rnd_val, 1);
5582    } else if (6 == height) {
5583        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5584                                 dst, dst_stride, filter_x, filter_y,
5585                                 weight0, weight1, offset0, offset1, rnd_val);
5586    } else if (0 == (height % 4)) {
5587        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5588                                         src1_ptr, src2_stride,
5589                                         dst, dst_stride, filter_x, filter_y,
5590                                         height, weight0,
5591                                         weight1, offset0, offset1, rnd_val, 8);
5592    }
5593}
5594
/* Bi-directional weighted prediction, 4-tap (epel) horizontal+vertical
 * interpolation for a 12-pixel-wide block.
 *
 * The 12-wide block is processed as an 8-wide column (left part) followed by
 * a 4-wide column (right part, starting at x offset 8).  Each part keeps two
 * rows of vertically-interleaved horizontal-filter output as context across
 * loop iterations, as required by the 4-tap vertical filter.
 *
 * src0_ptr/src_stride:   8-bit reference pixels (uni-pred source 0)
 * src1_ptr/src2_stride:  16-bit intermediate samples (source 1)
 * dst/dst_stride:        8-bit output
 * filter_x, filter_y:    4-tap fractional-sample filters
 * weight0/1, offset0/1:  explicit weighted-prediction parameters
 * rnd_val:               log2 of the weighted-prediction rounding shift
 *
 * NOTE(review): 'height' is never read — both loops are hard-coded
 * (4 iterations x 4 rows, then 2 iterations x 8 rows), so this routine
 * only handles height == 16.  Confirm callers never pass other heights.
 */
static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* Step back one row and one column: the 4-tap filters need one sample
     * of context before the block origin in each direction. */
    src0_ptr -= (src_stride + 1);

    /* Broadcast the two 16-bit horizontal filter taps. */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Sign-extend the vertical taps to 16 bits, then broadcast tap pairs
     * as 32-bit lanes for the vertical (dot-product) stage. */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* mask0/1: 8-width shuffle patterns for the horizontal filter. */
    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    /* Combined bi-weight constants: one packed (weight0 | weight1 << 16)
     * word so a single dpadd applies both weights, and an offset that
     * folds both per-source offsets plus the rounding bias. */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* presumably compensates the -128 bias introduced by XORI_B*_128 on
     * the 8-bit source (128 << 6 per weight1) — TODO confirm. */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    /* ---- Left 8-wide column ---- */
    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    /* Prime the vertical filter with 3 rows of horizontal output. */
    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    /* Interleave consecutive rows so the vertical stage can use
     * 32-bit dot products over (row n, row n+1) pairs. */
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    /* 4 iterations x 4 output rows = 16 rows. */
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        /* 16-bit samples from the second (intermediate) source. */
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        /* Horizontal 4-tap filter on the 4 new rows. */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        /* Vertical 4-tap filter (32-bit intermediates, low/high halves). */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        /* Down-shift to the 14-bit intermediate range, then pack to h. */
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);
        /* Interleave filtered samples with the src1 samples so the packed
         * (weight0 | weight1) dpadd computes w1*in + w0*filtered + offset. */
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        /* Rounded shift by (rnd_val + 1), clip to [0, 255], store 4x8. */
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        /* Carry the last two interleaved rows into the next iteration. */
        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    /* ---- Right 4-wide column (x = 8..11) ---- */
    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;

    /* mask2/3: shuffle patterns that filter two 4-wide rows per vector. */
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* dst22 = row 2's horizontal output (upper half of dst21). */
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    /* 2 iterations x 8 output rows = 16 rows. */
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* Pair rows 4 apart (3&7, 4&8, ...) so one shuffle+filter yields
         * two 4-wide rows per vector. */
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* Build all consecutive-row interleaves needed by the vertical
         * filter from the dual-row vectors. */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* Gather 8 rows of 4 16-bit src1 samples, two rows per vector. */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);

        /* Same weighting/rounding/clipping pipeline as the 8-wide part. */
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* Carry vertical context into the next 8-row batch. */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
5833
5834static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
5835                                     int32_t src_stride,
5836                                     int16_t *src1_ptr,
5837                                     int32_t src2_stride,
5838                                     uint8_t *dst,
5839                                     int32_t dst_stride,
5840                                     const int8_t *filter_x,
5841                                     const int8_t *filter_y,
5842                                     int32_t height,
5843                                     int32_t weight0,
5844                                     int32_t weight1,
5845                                     int32_t offset0,
5846                                     int32_t offset1,
5847                                     int32_t rnd_val)
5848{
5849    if (4 == height) {
5850        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5851                                     src2_stride, dst, dst_stride, filter_x,
5852                                     filter_y, weight0, weight1, offset0,
5853                                     offset1, rnd_val, 2);
5854    } else {
5855        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
5856                                         src2_stride, dst, dst_stride,
5857                                         filter_x, filter_y, height, weight0,
5858                                         weight1, offset0, offset1, rnd_val, 16);
5859    }
5860}
5861
5862static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
5863                                     int32_t src_stride,
5864                                     int16_t *src1_ptr,
5865                                     int32_t src2_stride,
5866                                     uint8_t *dst,
5867                                     int32_t dst_stride,
5868                                     const int8_t *filter_x,
5869                                     const int8_t *filter_y,
5870                                     int32_t height,
5871                                     int32_t weight0,
5872                                     int32_t weight1,
5873                                     int32_t offset0,
5874                                     int32_t offset1,
5875                                     int32_t rnd_val)
5876{
5877    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5878                                     src1_ptr, src2_stride,
5879                                     dst, dst_stride,
5880                                     filter_x, filter_y, height, weight0,
5881                                     weight1, offset0, offset1, rnd_val, 24);
5882}
5883
5884static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
5885                                     int32_t src_stride,
5886                                     int16_t *src1_ptr,
5887                                     int32_t src2_stride,
5888                                     uint8_t *dst,
5889                                     int32_t dst_stride,
5890                                     const int8_t *filter_x,
5891                                     const int8_t *filter_y,
5892                                     int32_t height,
5893                                     int32_t weight0,
5894                                     int32_t weight1,
5895                                     int32_t offset0,
5896                                     int32_t offset1,
5897                                     int32_t rnd_val)
5898{
5899    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5900                                     src1_ptr, src2_stride,
5901                                     dst, dst_stride,
5902                                     filter_x, filter_y, height, weight0,
5903                                     weight1, offset0, offset1, rnd_val, 32);
5904}
5905
/* BI_W_MC_COPY(WIDTH): emits the public entry point
 * ff_hevc_put_hevc_bi_w_pel_pixels<WIDTH>_8_msa — bi-directional explicit
 * weighted prediction with no interpolation, combining the 8-bit reference
 * block with the 16-bit intermediate 'src_16bit' (stride MAX_PB_SIZE).
 * log2Wd = denom + (14 + 1 - 8) - 1 = denom + 6 for 8-bit content; it is
 * passed to the helper as the rounding value. */
#define BI_W_MC_COPY(WIDTH)                                                  \
void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,           \
                                                     ptrdiff_t dst_stride,   \
                                                     uint8_t *src,           \
                                                     ptrdiff_t src_stride,   \
                                                     int16_t *src_16bit,     \
                                                     int height,             \
                                                     int denom,              \
                                                     int weight0,            \
                                                     int weight1,            \
                                                     int offset0,            \
                                                     int offset1,            \
                                                     intptr_t mx,            \
                                                     intptr_t my,            \
                                                     int width)              \
{                                                                            \
    int shift = 14 + 1 - 8;                                                  \
    int log2Wd = denom + shift - 1;                                          \
                                                                             \
    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                   dst, dst_stride, height,                  \
                                   weight0, weight1, offset0,                \
                                   offset1, log2Wd);                         \
}

/* One entry point per supported HEVC block width. */
BI_W_MC_COPY(4);
BI_W_MC_COPY(6);
BI_W_MC_COPY(8);
BI_W_MC_COPY(12);
BI_W_MC_COPY(16);
BI_W_MC_COPY(24);
BI_W_MC_COPY(32);
BI_W_MC_COPY(48);
BI_W_MC_COPY(64);

#undef BI_W_MC_COPY
5942
/* BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR): emits the public entry
 * point ff_hevc_put_hevc_bi_w_<PEL>_<DIR><WIDTH>_8_msa for bi-directional
 * weighted prediction with one-dimensional interpolation.
 *   PEL      - 'qpel' (luma, 8-tap) or 'epel' (chroma, 4-tap)
 *   DIR/DIR1 - public/internal direction tags: h/hz or v/vt
 *   FILT_DIR - which fractional offset (mx or my) indexes the filter table
 * log2Wd = denom + 14 - 8 = denom + 6 for 8-bit content. */
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \
void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \
                                                        ptrdiff_t             \
                                                        dst_stride,           \
                                                        uint8_t *src,         \
                                                        ptrdiff_t             \
                                                        src_stride,           \
                                                        int16_t *src_16bit,   \
                                                        int height,           \
                                                        int denom,            \
                                                        int weight0,          \
                                                        int weight1,          \
                                                        int offset0,          \
                                                        int offset1,          \
                                                        intptr_t mx,          \
                                                        intptr_t my,          \
                                                        int width)            \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
    int log2Wd = denom + 14 - 8;                                              \
                                                                              \
    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
                                                MAX_PB_SIZE, dst, dst_stride, \
                                                filter, height, weight0,      \
                                                weight1, offset0, offset1,    \
                                                log2Wd);                      \
}

/* Luma horizontal (8-tap, mx). */
BI_W_MC(qpel, h, 4, 8, hz, mx);
BI_W_MC(qpel, h, 8, 8, hz, mx);
BI_W_MC(qpel, h, 12, 8, hz, mx);
BI_W_MC(qpel, h, 16, 8, hz, mx);
BI_W_MC(qpel, h, 24, 8, hz, mx);
BI_W_MC(qpel, h, 32, 8, hz, mx);
BI_W_MC(qpel, h, 48, 8, hz, mx);
BI_W_MC(qpel, h, 64, 8, hz, mx);

/* Luma vertical (8-tap, my). */
BI_W_MC(qpel, v, 4, 8, vt, my);
BI_W_MC(qpel, v, 8, 8, vt, my);
BI_W_MC(qpel, v, 12, 8, vt, my);
BI_W_MC(qpel, v, 16, 8, vt, my);
BI_W_MC(qpel, v, 24, 8, vt, my);
BI_W_MC(qpel, v, 32, 8, vt, my);
BI_W_MC(qpel, v, 48, 8, vt, my);
BI_W_MC(qpel, v, 64, 8, vt, my);

/* Chroma horizontal (4-tap, mx). */
BI_W_MC(epel, h, 4, 4, hz, mx);
BI_W_MC(epel, h, 8, 4, hz, mx);
BI_W_MC(epel, h, 6, 4, hz, mx);
BI_W_MC(epel, h, 12, 4, hz, mx);
BI_W_MC(epel, h, 16, 4, hz, mx);
BI_W_MC(epel, h, 24, 4, hz, mx);
BI_W_MC(epel, h, 32, 4, hz, mx);

/* Chroma vertical (4-tap, my). */
BI_W_MC(epel, v, 4, 4, vt, my);
BI_W_MC(epel, v, 8, 4, vt, my);
BI_W_MC(epel, v, 6, 4, vt, my);
BI_W_MC(epel, v, 12, 4, vt, my);
BI_W_MC(epel, v, 16, 4, vt, my);
BI_W_MC(epel, v, 24, 4, vt, my);
BI_W_MC(epel, v, 32, 4, vt, my);

#undef BI_W_MC
6006
/* BI_W_MC_HV(PEL, WIDTH, TAP): emits the public entry point
 * ff_hevc_put_hevc_bi_w_<PEL>_hv<WIDTH>_8_msa for bi-directional weighted
 * prediction with 2-D (horizontal + vertical) interpolation.  Both mx and
 * my index the PEL filter table; log2Wd = denom + 6 for 8-bit content. */
#define BI_W_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                     ptrdiff_t dst_stride,  \
                                                     uint8_t *src,          \
                                                     ptrdiff_t src_stride,  \
                                                     int16_t *src_16bit,    \
                                                     int height,            \
                                                     int denom,             \
                                                     int weight0,           \
                                                     int weight1,           \
                                                     int offset0,           \
                                                     int offset1,           \
                                                     intptr_t mx,           \
                                                     intptr_t my,           \
                                                     int width)             \
{                                                                           \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];               \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];               \
    int log2Wd = denom + 14 - 8;                                            \
                                                                            \
    hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
                                          MAX_PB_SIZE, dst, dst_stride,     \
                                          filter_x, filter_y, height,       \
                                          weight0, weight1, offset0,        \
                                          offset1, log2Wd);                 \
}

/* Luma HV (8-tap). */
BI_W_MC_HV(qpel, 4, 8);
BI_W_MC_HV(qpel, 8, 8);
BI_W_MC_HV(qpel, 12, 8);
BI_W_MC_HV(qpel, 16, 8);
BI_W_MC_HV(qpel, 24, 8);
BI_W_MC_HV(qpel, 32, 8);
BI_W_MC_HV(qpel, 48, 8);
BI_W_MC_HV(qpel, 64, 8);

/* Chroma HV (4-tap). */
BI_W_MC_HV(epel, 4, 4);
BI_W_MC_HV(epel, 8, 4);
BI_W_MC_HV(epel, 6, 4);
BI_W_MC_HV(epel, 12, 4);
BI_W_MC_HV(epel, 16, 4);
BI_W_MC_HV(epel, 24, 4);
BI_W_MC_HV(epel, 32, 4);

#undef BI_W_MC_HV
6052