1/*
2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "libavcodec/mips/hevcdsp_mips.h"
23#include "libavcodec/mips/hevc_macros_msa.h"
24
/* Byte-shuffle control masks for VSHF_B-style selects.
 * First row: pairs of adjacent bytes 0..8 from a single source vector
 * (used by the 8-wide horizontal filter paths).
 * Second row: indices >= 16 select from the second source operand, so this
 * mask interleaves byte pairs taken from two different source vectors
 * (used by the 4-wide paths that pack two rows per vector). */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
30
/* Bi-prediction combine for two v8i16 vectors:
 * out = clip_u8(round_shift(vec + in, rnd_val)), i.e. saturating-add the
 * two 16-bit predictions, round-shift right by rnd_val, clip to [0, 255]. */
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}
37
/* Four-vector variant of HEVC_BI_RND_CLIP2: combines, rounds and clips
 * four v8i16 prediction pairs with the same rnd_val. */
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
                          vec0, vec1, vec2, vec3, rnd_val,         \
                          out0, out1, out2, out3)                  \
{                                                                  \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}
45
/* Historically a separate saturating variant; its expansion is now
 * byte-for-byte identical to HEVC_BI_RND_CLIP2 (saturating add,
 * round-shift by rnd_val, clip to [0, 255]).  Define it as an alias so
 * the two implementations cannot silently diverge. */
#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,  \
                                   out0, out1)                     \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)
53
/* Four-vector variant of HEVC_BI_RND_CLIP2_MAX_SATU: combines, rounds and
 * clips four v8i16 prediction pairs with the same rnd_val. */
#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,    \
                                   vec3,  rnd_val, out0, out1, out2, out3)  \
{                                                                           \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}
60
/* Bi-prediction "copy" for 4-pixel-wide blocks: widen 8-bit samples from
 * src0_ptr to 16 bits and shift left by 6, add the 16-bit samples from
 * src1_ptr, round-shift by 7, clip to [0, 255] and store 4 bytes per row.
 * Handles height == 2, height == 4 and any multiple of 8; other heights
 * fall through and produce no output (callers only pass legal HEVC block
 * heights — NOTE(review): confirm against the dispatch table). */
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

    if (2 == height) {
        /* Two rows: one 32-bit load per 4-pixel row, one 64-bit load per
           8-sample row of 16-bit input. */
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        /* Zero-extend bytes to halfwords, scale to 14-bit precision. */
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;
        dst0 += in0;
        dst0 = __msa_srari_h(dst0, 7);
        CLIP_SH_0_255(dst0);

        /* Pack halfwords back to bytes and store 4 bytes per row. */
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(dst0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        /* 8 rows per iteration: two vectors of four 4-pixel rows each. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
128
/* Bi-prediction "copy" for 6-pixel-wide blocks, 8 rows per iteration:
 * widen src0 bytes, << 6, add the 16-bit src1 samples, round-shift by 7,
 * clip to [0, 255].  Each 6-byte row is stored as a 4-byte word plus a
 * 2-byte halfword.  height is assumed to be a multiple of 8
 * (NOTE(review): no remainder handling — confirm callers guarantee this). */
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* 64-bit loads cover the 6 used bytes per row (2 bytes overread
           within the load, never stored). */
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        /* Store 4 + 2 bytes per row: word at offset 0, halfword at +4. */
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}
180
/* Bi-prediction "copy" for 8-pixel-wide blocks: widen src0 bytes, << 6,
 * add src1 16-bit samples, round-shift by 7, clip to [0, 255] and store
 * 8 bytes per row.  Handles height 2, 4, 6 and multiples of 8; other
 * heights produce no output. */
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        /* 4 rows + 2 rows: three source vectors of two rows each. */
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;

        /* 8 rows per iteration, two rows packed per source vector. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
                   in7);
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
                                       dst7, 7, dst4, dst5, dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
268
/* Bi-prediction "copy" for 12-pixel-wide blocks, 4 rows per iteration.
 * The left 8 columns are handled full-width; the right 4 columns of two
 * rows are packed into one vector via ILVL_W2_SB.  The loop runs a fixed
 * 4 iterations (16 rows) and ignores `height` — presumably this function
 * is only dispatched for height == 16; NOTE(review): confirm against the
 * caller before relying on other heights. */
static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* Merge the right-hand 4-sample halves of two rows per vector. */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
                   dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        /* Gather columns 8..11 of rows 0/1 and 2/3 into src0/src1. */
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
307
/* Bi-prediction "copy" for 16-pixel-wide blocks, 4 rows per iteration.
 * Each row is split into right (low 8 bytes) and left (high 8 bytes)
 * halves, widened, << 6, combined with the 16-bit input, round-shifted by
 * 7 and clipped.  height is assumed to be a multiple of 4. */
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v16i8 zero = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        /* in0..in3: left 8 samples of each row; in4..in7: right 8. */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
        ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
        ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
        ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
345
/* Bi-prediction "copy" for 24-pixel-wide blocks, 4 rows per iteration:
 * a full 16-byte vector per row plus an 8-byte tail (stored at dst + 16).
 * The loop runs a fixed 8 iterations (32 rows) and ignores `height` —
 * presumably only dispatched for height == 32; NOTE(review): confirm
 * against the caller. */
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = 8; loop_cnt--;) {
        /* src0/src1/src4/src5: first 16 columns; src2/src3/src6/src7:
           columns 16..23 (only low 8 bytes used). */
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        /* Input ordering below pairs each widened vector with the matching
           row/column segment of the 16-bit input. */
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}
391
/* Bi-prediction "copy" for 32-pixel-wide blocks, 2 rows per iteration:
 * two 16-byte vectors per row, widened, << 6, combined with four 8-sample
 * chunks of 16-bit input per row, round-shifted by 7 and clipped.
 * height is assumed to be even. */
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}
435
/* Bi-prediction "copy" for 48-pixel-wide blocks, 2 rows per iteration:
 * three 16-byte vectors per row, widened, << 6, combined with six 8-sample
 * chunks of 16-bit input per row, round-shifted by 7 and clipped.
 * height is assumed to be even. */
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;

        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);

        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}
489
/* Bi-prediction "copy" for 64-pixel-wide blocks, 1 row per iteration:
 * four 16-byte vectors per row, widened, << 6, combined with eight
 * 8-sample chunks of 16-bit input, round-shifted by 7 and clipped. */
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}
528
/* Horizontal 8-tap bi-prediction filter for 4-pixel-wide blocks, 8 rows
 * per iteration.  src0_ptr is rewound by 3 for the filter taps; bytes are
 * XORed with 128 so signed dot products can be used, and const_vec
 * (128 << 6) pre-compensates that bias in the accumulator.  The filtered
 * result is added to the 16-bit input from src1_ptr, round-shifted by 7
 * and clipped to [0, 255].  height is assumed to be a multiple of 8. */
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    /* Second mask row: selects across two source registers (4w case). */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* Bias compensation for the XOR-128 trick below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* Pack two 4-sample rows of 16-bit input per vector. */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        /* Accumulate the four filter-tap pairs via shuffles + dot products. */
        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
601
/* Horizontal 8-tap bi-prediction filter for 8-pixel-wide blocks, 4 rows
 * per iteration.  Same scheme as the 4w variant: rewind src0_ptr by 3 for
 * the taps, XOR bytes with 128 for signed dot products with const_vec
 * (128 << 6) pre-compensating the bias, then add the 16-bit input,
 * round-shift by 7 and clip to [0, 255].  height is assumed to be a
 * multiple of 4. */
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    /* First mask row: selects within a single source register (8w case). */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        /* Accumulate the four filter-tap pairs via shuffles + dot products. */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
669
/* HEVC horizontal 8-tap bi-prediction luma filter, 12-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : nominally the block height (see loop note).
 *
 * Two rows are produced per iteration: dst0/dst2 hold columns 0-7 of
 * rows 0/1; dst1 holds columns 8-11 of both rows (row 0 in its low
 * half, row 1 in its high half) via mask4..mask7, which shuffle
 * across the src1:src3 register pair.
 */
static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    int32_t tmp0, tmp1;
    int64_t tmp2, tmp3;
    v16i8 src0, src1, src2, src3;
    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window spans [-3, +4] around each output pixel. */
    src0_ptr -= 3;
    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * introduced by XORI_B4_128_SB below (taps summing to 64) —
     * same pattern as the sibling functions in this file. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..3: single-register shuffles for columns 0-7;
     * mask4..7: cross-register shuffles (indices >= 16 select the
     * second source vector) for columns 8-11 of two rows at once. */
    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    /* NOTE(review): fixed 8 iterations x 2 rows = 16 rows; assumes
     * height == 16 for this width — confirm against callers. */
    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;

        /* For each tap pair: vec0 -> row 0 cols 0-7, vec2 -> row 1
         * cols 0-7, vec1 -> cols 8-11 of both rows (src1:src3). */
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);

        /* Rearrange in1 to match dst1's layout: low dword = row 0
         * cols 8-11, high dword = row 1 cols 8-11. */
        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        CLIP_SH_0_255(dst2);
        PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);

        /* After packing: dst0 bytes 0-7  = row 0 cols 0-7 (tmp2),
         *                dst0 word  2    = row 0 cols 8-11 (tmp0),
         *                dst1 bytes 0-7  = row 1 cols 0-7 (tmp3),
         *                dst0 word  3    = row 1 cols 8-11 (tmp1). */
        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
        SD(tmp2, dst);
        SW(tmp0, dst + 8);
        dst += dst_stride;
        SD(tmp3, dst);
        SW(tmp1, dst + 8);
        dst += dst_stride;
    }
}
757
/* HEVC horizontal 8-tap bi-prediction luma filter, 16-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : block height; must be even (2 rows/iter).
 *
 * Each row is computed as two 8-pixel halves (dst0/dst1 = row 0,
 * dst2/dst3 = row 1), using the two 16-byte loads per row so the
 * single-register masks mask0..mask3 cover all column offsets.
 */
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    /* 8-tap window spans [-3, +4] around each output pixel. */
    src0_ptr -= 3;
    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from XORI_B4_128_SB below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* src0/src1 = row 0 (offsets 0 and 8), src2/src3 = row 1. */
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        /* Accumulate one tap pair per mask into all four halves. */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        /* Add the other prediction, round by 7, clip to 0..255. */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
828
829static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
830                                  int32_t src_stride,
831                                  int16_t *src1_ptr,
832                                  int32_t src2_stride,
833                                  uint8_t *dst,
834                                  int32_t dst_stride,
835                                  const int8_t *filter,
836                                  int32_t height)
837{
838    uint32_t loop_cnt;
839    uint64_t dst_val0;
840    v16i8 src0, src1, tmp0, tmp1;
841    v8i16 filt0, filt1, filt2, filt3;
842    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
843    v16i8 vec0, vec1, vec2, vec3;
844    v8i16 dst0, dst1, dst2;
845    v8i16 in0, in1, in2;
846    v8i16 filter_vec, const_vec;
847    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
848
849    src0_ptr = src0_ptr - 3;
850    const_vec = __msa_ldi_h(128);
851    const_vec <<= 6;
852
853    filter_vec = LD_SH(filter);
854    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
855
856    mask1 = mask0 + 2;
857    mask2 = mask0 + 4;
858    mask3 = mask0 + 6;
859    mask4 = mask0 + 8;
860    mask5 = mask0 + 10;
861    mask6 = mask0 + 12;
862    mask7 = mask0 + 14;
863
864    for (loop_cnt = height; loop_cnt--;) {
865        LD_SB2(src0_ptr, 16, src0, src1);
866        src0_ptr += src_stride;
867        LD_SH2(src1_ptr, 8, in0, in1);
868        in2 = LD_SH(src1_ptr + 16);
869        src1_ptr += src2_stride;
870        XORI_B2_128_SB(src0, src1);
871
872        dst0 = const_vec;
873        dst1 = const_vec;
874        dst2 = const_vec;
875        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
876        VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
877        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
878                     dst1, dst2, dst0);
879        VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
880        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
881        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
882                     dst2, dst0, dst1);
883        VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
884        VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
885        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
886                     dst0, dst1, dst2);
887
888        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
889        dst2 = __msa_adds_s_h(dst2, in2);
890        dst2 = __msa_srari_h(dst2, 7);
891        CLIP_SH_0_255(dst2);
892
893        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
894        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
895        ST_SB(tmp0, dst);
896        SD(dst_val0, dst + 16);
897        dst += dst_stride;
898    }
899}
900
/* HEVC horizontal 8-tap bi-prediction luma filter, 32-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : block height; one row per iteration.
 *
 * One row = four 8-pixel groups: dst0 (cols 0-7, src0), dst1
 * (cols 8-15, src0/src1 boundary via mask4..7), dst2 (cols 16-23,
 * src1), dst3 (cols 24-31, src2 = unaligned reload at offset 24).
 */
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    /* 8-tap window spans [-3, +4] around each output pixel. */
    src0_ptr -= 3;
    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from XORI_B3_128_SB below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask4..7 select across two source registers (indices >= 16). */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        /* Overlapping load at +24 covers the tail taps of cols 24-31
         * (up to byte 31 + 4) within a single register. */
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        /* Add the other prediction, round by 7, clip to 0..255. */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
        dst += dst_stride;
    }
}
972
/* HEVC horizontal 8-tap bi-prediction luma filter, 48-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : see loop note below.
 *
 * One row per iteration, computed as six 8-pixel groups: cols 0-31
 * first (dst0..dst3, using cross-register masks at the src0/src1 and
 * src1/src2 boundaries), then cols 32-47 (dst4/dst5 from src2/src3,
 * where src3 is an overlapping reload at offset 40).
 */
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    /* 8-tap window spans [-3, +4] around each output pixel. */
    src0_ptr -= 3;

    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from XORI_B4_128_SB below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    /* NOTE(review): row count is hard-wired to 64 and the height
     * parameter is ignored — presumably 48-wide blocks only occur
     * with height 64 here; confirm against callers. */
    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        /* Overlapping load at +40 covers the tail taps of cols 40-47. */
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        /* Cols 0-31: add other prediction, round by 7, clip, store. */
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB(tmp0, dst);
        ST_SB(tmp1, dst + 16);

        /* src1_ptr is advanced only here, after its last read. */
        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
                     dst5, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
                     dst5, dst4, dst5);

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
        dst += dst_stride;
    }
}
1064
/* HEVC horizontal 8-tap bi-prediction luma filter, 64-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : block height; one row per iteration.
 *
 * Each row is processed as two identical 32-column passes: cols 0-31
 * from src0/src1/src2, then cols 32-63 by rotating src3/src4/src5
 * into the same registers and re-running the same shuffle sequence.
 */
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window spans [-3, +4] around each output pixel. */
    src0_ptr -= 3;

    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from XORI_B3_128_SB below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask4..7 select across two source registers (indices >= 16). */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        /* First half: src0/src1 at 0/16, overlapping src2 at +24;
         * second half preloaded: src3/src4 at 32/48, src5 at +56. */
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SB2(src0_ptr + 32, 16, src3, src4);
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        /* Cols 0-31: add other prediction, round by 7, clip, store. */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);

        /* Rotate the second half into place and repeat for cols 32-63. */
        src0 = src3;
        src1 = src4;
        src2 = src5;

        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
        dst += dst_stride;
    }
}
1174
/* HEVC vertical 8-tap bi-prediction luma filter, 4-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : block height; must be a multiple of 8.
 *
 * Rows are byte-interleaved pairwise (srcNM_r) and then two row-pairs
 * are packed per register (e.g. src2110), so each DPADD covers the
 * 4-pixel width of two output rows at once; 8 rows per iteration.
 */
static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window spans rows [-3, +4] around the output row. */
    src0_ptr -= (3 * src_stride);

    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from the XORI_B*_128 calls below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* Prologue: load the 7 context rows above the first output row
     * and pre-pack them (the XOR is applied after interleaving,
     * which is equivalent to XORing the individual rows). */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* Pack two 4-wide intermediate rows per v8i16 register. */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        /* Each dstNM accumulates all 8 taps for one row pair. */
        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        /* Add the other prediction, round by 7, clip to 0..255. */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* Slide the context window: keep the last 7 rows' packings. */
        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
1260
/* HEVC vertical 8-tap bi-prediction luma filter, 8-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : block height; must be a multiple of 4.
 *
 * Adjacent rows are byte-interleaved (srcNM_r = rows N and M zipped),
 * so one DPADD per interleave accumulates two taps across all 8
 * columns; four rows are produced per iteration.
 */
static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window spans rows [-3, +4] around the output row. */
    src0_ptr -= (3 * src_stride);
    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from the XORI_B*_128 calls below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* Prologue: 7 context rows, sign-flipped then interleaved. */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        /* Each dstN_r accumulates all 8 taps for output row N. */
        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        /* Add the other prediction, round by 7, clip to 0..255. */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the context window down by 4 rows. */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}
1338
/* HEVC vertical 8-tap bi-prediction luma filter, 12-column variant.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels to be filtered.
 * src1_ptr/src2_stride : 16-bit intermediate samples of the other
 *                        prediction; added before rounding/clipping.
 * dst/dst_stride       : 8-bit output block.
 * filter               : 8 filter taps (int8), loaded as one v8i16.
 * height               : block height; must be a multiple of 4.
 *
 * Columns 0-7 are handled like the 8-wide case (right interleaves,
 * dstN_r). Columns 8-11 come from the left interleaves, with two
 * row-pairs packed per register (src2110 etc.) so each dstN_l covers
 * two output rows; four rows are produced per iteration.
 */
static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    /* 8-tap window spans rows [-3, +4] around the output row. */
    src0_ptr -= (3 * src_stride);
    /* Bias of 8192 (128 << 6); presumably cancels the -128 offset
     * from the XORI_B*_128 calls below (taps summing to 64). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* Prologue: 7 context rows, sign-flipped, then interleaved —
     * right halves for cols 0-7, left halves packed pairwise for
     * cols 8-11. */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        /* Pack two rows' cols 8-11 of the other prediction per reg. */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        /* Cols 0-7: one accumulator per output row. */
        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        /* Cols 8-11: one accumulator per pair of output rows. */
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        /* Add the other prediction, round by 7, clip to 0..255. */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);


        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        /* 8-byte stores for cols 0-7, 4-byte stores for cols 8-11. */
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the context window down by 4 rows. */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
1445
/* Vertical 8-tap luma bi-prediction for widths that are a multiple of 16.
 *
 * The block is processed in 16-column strips (outer loop); within a strip
 * the 8-tap vertical filter produces two output rows per inner-loop
 * iteration.  The 16-bit first-prediction values from src1_ptr are added
 * to the filtered result, then rounded (>> 7) and clipped to 0..255.
 *
 * src0_ptr/src_stride  : 8-bit reference pels (rewound by 3 rows so the
 *                        8-tap window is centred on the output row)
 * src1_ptr/src2_stride : 16-bit first-prediction samples
 * dst/dst_stride       : 8-bit output
 * filter               : 8 vertical filter taps
 * height, width        : block size; width must be a multiple of 16
 */
static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                           int32_t src_stride,
                                           int16_t *src1_ptr,
                                           int32_t src2_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter,
                                           int32_t height, int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    /* 128 << 6 compensates for the 128 subtracted from each source byte
     * by the XORI_B*_128 conversions below (scaled by the filter gain). */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* One outer iteration per 16-column strip. */
    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        /* Prologue: load the 7 rows above the first output row and
         * build the interleaved row pairs the dot products consume
         * (_r = low/right 8 columns, _l = high/left 8 columns). */
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        /* Two output rows per iteration. */
        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            XORI_B2_128_SB(src7, src8);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            /* 8-tap vertical filter via four paired dot products,
             * separately for the low and high 8 columns of each row. */
            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

            /* Add first prediction, round by 7 bits, clip to 0..255. */
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* Slide the interleaved-row window down by two rows. */
            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}
1551
1552static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
1553                                  int32_t src_stride,
1554                                  int16_t *src1_ptr,
1555                                  int32_t src2_stride,
1556                                  uint8_t *dst,
1557                                  int32_t dst_stride,
1558                                  const int8_t *filter,
1559                                  int32_t height)
1560{
1561    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1562                                   dst, dst_stride, filter, height, 16);
1563}
1564
1565static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
1566                                  int32_t src_stride,
1567                                  int16_t *src1_ptr,
1568                                  int32_t src2_stride,
1569                                  uint8_t *dst,
1570                                  int32_t dst_stride,
1571                                  const int8_t *filter,
1572                                  int32_t height)
1573{
1574    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1575                                   dst, dst_stride, filter, height, 16);
1576    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1577                         dst + 16, dst_stride, filter, height);
1578}
1579
1580static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
1581                                  int32_t src_stride,
1582                                  int16_t *src1_ptr,
1583                                  int32_t src2_stride,
1584                                  uint8_t *dst,
1585                                  int32_t dst_stride,
1586                                  const int8_t *filter,
1587                                  int32_t height)
1588{
1589    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1590                                   dst, dst_stride, filter, height, 32);
1591}
1592
1593static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
1594                                  int32_t src_stride,
1595                                  int16_t *src1_ptr,
1596                                  int32_t src2_stride,
1597                                  uint8_t *dst,
1598                                  int32_t dst_stride,
1599                                  const int8_t *filter,
1600                                  int32_t height)
1601{
1602    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1603                                   dst, dst_stride, filter, height, 48);
1604}
1605
1606static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
1607                                  int32_t src_stride,
1608                                  int16_t *src1_ptr,
1609                                  int32_t src2_stride,
1610                                  uint8_t *dst,
1611                                  int32_t dst_stride,
1612                                  const int8_t *filter,
1613                                  int32_t height)
1614{
1615    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1616                                   dst, dst_stride, filter, height, 64);
1617}
1618
/* 2-D (horizontal then vertical) 8-tap luma bi-prediction, 4 columns wide,
 * processed four rows per loop iteration.
 *
 * Two source rows are packed per vector so the horizontal filter handles
 * a pair of 4-wide rows at once (mask0 comes from the second half of
 * ff_hevc_mask_arr, which shuffles two rows side by side).  The
 * horizontally filtered 16-bit rows are interleaved and fed to the
 * vertical 8-tap filter; the first-prediction samples (src1_ptr) are
 * added before rounding (>> 7) and clipping.
 */
static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 out0, out1;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v4i32 dst0, dst1, dst2, dst3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    /* Rewind 3 rows and 3 columns so both 8-tap windows are centred. */
    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* 128 << 6 compensates for the 128 subtracted from each source byte
     * by the XORI_B*_128 conversions below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* Prologue: horizontally filter the 7 rows above the first output
     * row; dstXY holds rows X and Y filtered together in one vector. */
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    /* Interleave consecutive filtered rows for the vertical dot products. */
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        /* Four rows of 4-wide first-prediction samples (64 bits each). */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        /* Vertical 8-tap filter for the four output rows. */
        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);

        /* Scale back to 16-bit, add first prediction and bias, then
         * round by 7 bits and clip to 0..255. */
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        SRARI_H2_SH(out0, out1, 7);
        CLIP_SH2_0_255(out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the vertical filter window down by four rows. */
        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
1743
/* 2-D (horizontal then vertical) 8-tap luma bi-prediction for widths that
 * are a multiple of 8.  The block is processed in 8-column strips; within
 * a strip, one output row is produced per inner-loop iteration.  The
 * 16-bit first-prediction samples from src1_ptr are added before rounding
 * (>> 7) and clipping to 0..255.
 */
static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height, int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, tmp;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;

    /* Rewind 3 rows and 3 columns so both 8-tap windows are centred. */
    src0_ptr -= ((3 * src_stride) + 3);
    /* 128 << 6 compensates for the 128 subtracted from each source byte
     * by the XORI_B*_128 conversions below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* One outer iteration per 8-column strip. */
    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        /* Prologue: horizontally filter the 7 rows above the first
         * output row (dst0..dst6). */
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        /* One output row per iteration. */
        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src0_ptr_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src0_ptr_tmp += src_stride;

            in0 = LD_SH(src1_ptr_tmp);
            src1_ptr_tmp += src2_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            /* Vertical 8-tap filter on the 8 horizontally filtered rows,
             * split into low (_r) and high (_l) 4-column halves. */
            ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            /* Add first prediction and bias, round by 7 bits, clip. */
            tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
            tmp = __msa_srari_h(tmp, 7);
            CLIP_SH_0_255(tmp);
            out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
            ST_D1(out, 0, dst_tmp);
            dst_tmp += dst_stride;

            /* Slide the vertical filter window down by one row. */
            dst0 = dst1;
            dst1 = dst2;
            dst2 = dst3;
            dst3 = dst4;
            dst4 = dst5;
            dst5 = dst6;
            dst6 = dst7;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
1876
1877static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
1878                                 int32_t src_stride,
1879                                 int16_t *src1_ptr,
1880                                 int32_t src2_stride,
1881                                 uint8_t *dst,
1882                                 int32_t dst_stride,
1883                                 const int8_t *filter_x,
1884                                 const int8_t *filter_y,
1885                                 int32_t height)
1886{
1887    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1888                                  dst, dst_stride, filter_x, filter_y,
1889                                  height, 8);
1890}
1891
/* 2-D (horizontal then vertical) 8-tap luma bi-prediction, 12 columns
 * wide.  Columns 0..7 are filtered one row at a time (first half of the
 * function); columns 8..11 are then filtered four rows at a time using
 * the paired-row shuffle masks from ff_hevc_mask_arr + 16.
 *
 * NOTE(review): the row loops are hard-coded to 16 rows (loop_cnt = 16
 * for the 8-wide part, 4 iterations x 4 rows for the 4-wide part), so
 * the `height` argument is effectively assumed to be 16 — confirm that
 * all 12-wide callers pass height == 16.
 */
static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;

    /* Rewind 3 rows and 3 columns so both 8-tap windows are centred. */
    src0_ptr -= ((3 * src_stride) + 3);

    /* 128 << 6 compensates for the 128 subtracted from each source byte
     * by the XORI_B*_128 conversions below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    /* Single-row shuffle masks for the 8-wide left part. */
    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    /* --- Left 8 columns: one output row per iteration. --- */
    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
           src6);
    src0_ptr_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);
    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                             filt2, filt3);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src0_ptr_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src0_ptr_tmp += src_stride;

        in0 = LD_SH(src1_ptr_tmp);
        src1_ptr_tmp += src2_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                 filt2, filt3);
        /* Vertical 8-tap filter over the horizontally filtered rows. */
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        /* Add first prediction and bias, round by 7 bits, clip. */
        tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
        tmp = __msa_srari_h(tmp, 7);
        CLIP_SH_0_255(tmp);
        out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
        ST_D1(out, 0, dst_tmp);
        dst_tmp += dst_stride;

        /* Slide the vertical filter window down by one row. */
        dst0 = dst1;
        dst1 = dst2;
        dst2 = dst3;
        dst3 = dst4;
        dst4 = dst5;
        dst5 = dst6;
        dst6 = dst7;
    }

    /* --- Right 4 columns (8..11): four output rows per iteration. --- */
    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;

    /* Paired-row shuffle masks (two 4-wide rows per vector). */
    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
               vec12, vec13, vec14, vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        /* Four rows of 4-wide first-prediction samples (64 bits each). */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        /* Scale back to 16-bit, add first prediction and bias, round and
         * clip to 0..255. */
        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        SRARI_H2_SH(out0, out1, 7);
        CLIP_SH2_0_255(out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* Slide the vertical filter window down by four rows. */
        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
2105
2106static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
2107                                  int32_t src_stride,
2108                                  int16_t *src1_ptr,
2109                                  int32_t src2_stride,
2110                                  uint8_t *dst,
2111                                  int32_t dst_stride,
2112                                  const int8_t *filter_x,
2113                                  const int8_t *filter_y,
2114                                  int32_t height)
2115{
2116    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2117                                  dst, dst_stride, filter_x, filter_y,
2118                                  height, 16);
2119}
2120
2121static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
2122                                  int32_t src_stride,
2123                                  int16_t *src1_ptr,
2124                                  int32_t src2_stride,
2125                                  uint8_t *dst,
2126                                  int32_t dst_stride,
2127                                  const int8_t *filter_x,
2128                                  const int8_t *filter_y,
2129                                  int32_t height)
2130{
2131    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2132                                  dst, dst_stride, filter_x, filter_y,
2133                                  height, 24);
2134}
2135
2136static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
2137                                  int32_t src_stride,
2138                                  int16_t *src1_ptr,
2139                                  int32_t src2_stride,
2140                                  uint8_t *dst,
2141                                  int32_t dst_stride,
2142                                  const int8_t *filter_x,
2143                                  const int8_t *filter_y,
2144                                  int32_t height)
2145{
2146    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2147                                  dst, dst_stride, filter_x, filter_y,
2148                                  height, 32);
2149}
2150
/* Bi-predictive 8-tap 2-D (horizontal + vertical) luma filter for
 * 48-pixel-wide blocks: delegates to the generic 8-column-multiple
 * kernel with width fixed at 48. */
static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 48);
}
2165
/* Bi-predictive 8-tap 2-D (horizontal + vertical) luma filter for
 * 64-pixel-wide blocks: delegates to the generic 8-column-multiple
 * kernel with width fixed at 64. */
static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 64);
}
2180
/* Horizontal bi-predictive 4-tap filter for a 4x2 block.
 * Filters two 4-pixel rows from src0_ptr, adds (with saturation) the
 * corresponding 16-bit intermediate rows from src1_ptr, rounds by
 * 7 bits and clips to [0, 255], then stores two 32-bit words to dst.
 * 'height' is unused: this path handles exactly 2 rows. */
static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, dst0, vec0, vec1;
    v8i16 in0, in1;
    /* second half of the mask table holds the 4-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 tmp0;
    v8i16 filter_vec, const_vec;

    /* 4-tap filter reads one pixel to the left of the current position */
    src0_ptr -= 1;

    /* 128 << 6 offsets the -128 bias introduced by XORI_*_128 below
     * (assumes the filter taps sum to 64 -- standard HEVC scaling) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* pack both 4-wide intermediate rows into one vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    tmp0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);

    tmp0 = __msa_adds_s_h(tmp0, in0);
    tmp0 = __msa_srari_h(tmp0, 7);
    CLIP_SH_0_255(tmp0);
    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);

    ST_W2(dst0, 0, 1, dst, dst_stride);
}
2223
/* Horizontal bi-predictive 4-tap filter for a 4x4 block.
 * Same scheme as the 4x2 variant but processes four rows: filter,
 * saturating add of the 16-bit intermediate, round by 7, clip to
 * [0, 255], store four 32-bit words. 'height' is unused (always 4). */
static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v16i8 vec2, vec3;
    /* 4-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 tmp0, tmp1;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

    /* pack the four 4-wide intermediate rows into two vectors */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    XORI_B4_128_SB(src0, src1, src2, src3);

    tmp0 = const_vec;
    tmp1 = const_vec;
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
                 tmp0, tmp1);
    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);

    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
2269
/* Horizontal bi-predictive 4-tap filter for 4-wide blocks whose height
 * is a multiple of 8: each loop iteration filters 8 rows, adds the
 * 16-bit intermediate, rounds/clips and stores 8 words to dst.
 * Heights not divisible by 8 leave the remainder unprocessed. */
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 dst0, dst1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    /* 4-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* pack eight 4-wide intermediate rows into four vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        tmp0 = const_vec;
        tmp1 = const_vec;
        tmp2 = const_vec;
        tmp3 = const_vec;
        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
                     tmp1, tmp2, tmp3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
                     tmp1, tmp2, tmp3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
2332
2333static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
2334                                 int32_t src_stride,
2335                                 int16_t *src1_ptr,
2336                                 int32_t src2_stride,
2337                                 uint8_t *dst,
2338                                 int32_t dst_stride,
2339                                 const int8_t *filter,
2340                                 int32_t height)
2341{
2342    if (2 == height) {
2343        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2344                              dst, dst_stride, filter, height);
2345    } else if (4 == height) {
2346        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2347                              dst, dst_stride, filter, height);
2348    } else if (8 == height || 16 == height) {
2349        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2350                                      src1_ptr, src2_stride,
2351                                      dst, dst_stride, filter, height);
2352    }
2353}
2354
/* Horizontal bi-predictive 4-tap filter for 6-wide blocks: four rows
 * per loop iteration. Each 6-pixel output row is stored as one 32-bit
 * word plus one 16-bit halfword. Height is assumed to be a multiple
 * of 4. */
static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    /* first half of the mask table: 8-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        /* per row: first 4 bytes as a word, remaining 2 as a halfword */
        ST_W2(dst0, 0, 2, dst, dst_stride);
        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}
2415
/* Horizontal bi-predictive 4-tap filter for an 8x2 block: filter two
 * 8-pixel rows, add the 16-bit intermediate rows, round/clip and store
 * two 64-bit doublewords. 'height' is unused (always 2). */
static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    /* 8-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src0, src1);

    dst0 = const_vec;
    dst1 = const_vec;
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
                 dst0, dst1);
    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);
}
2458
/* Horizontal bi-predictive 4-tap filter for an 8x6 block: processes the
 * first four rows and the last two rows in separate passes, then
 * rounds/clips and stores six 64-bit doublewords.
 * 'height' is unused (always 6). */
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    /* 8-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);

    /* rows 0-3 */
    dst0 = const_vec;
    dst1 = const_vec;
    dst2 = const_vec;
    dst3 = const_vec;
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
                 dst2, dst3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
                 dst2, dst3);
    /* rows 4-5 */
    dst4 = const_vec;
    dst5 = const_vec;

    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
                 dst4, dst5);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
2522
2523static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
2524                                          int32_t src_stride,
2525                                          int16_t *src1_ptr,
2526                                          int32_t src2_stride,
2527                                          uint8_t *dst,
2528                                          int32_t dst_stride,
2529                                          const int8_t *filter,
2530                                          int32_t height)
2531{
2532    uint32_t loop_cnt;
2533    v8i16 filt0, filt1;
2534    v16i8 src0, src1, src2, src3;
2535    v8i16 in0, in1, in2, in3;
2536    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2537    v16i8 mask1;
2538    v16i8 vec0, vec1, vec2, vec3;
2539    v8i16 dst0, dst1, dst2, dst3;
2540    v8i16 filter_vec, const_vec;
2541
2542    src0_ptr -= 1;
2543
2544    const_vec = __msa_ldi_h(128);
2545    const_vec <<= 6;
2546
2547    filter_vec = LD_SH(filter);
2548    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2549
2550    mask1 = mask0 + 2;
2551
2552    for (loop_cnt = (height >> 2); loop_cnt--;) {
2553        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2554        src0_ptr += (4 * src_stride);
2555        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2556        src1_ptr += (4 * src2_stride);
2557        XORI_B4_128_SB(src0, src1, src2, src3);
2558
2559        dst0 = const_vec;
2560        dst1 = const_vec;
2561        dst2 = const_vec;
2562        dst3 = const_vec;
2563        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2564        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2565        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2566                     dst1, dst2, dst3);
2567        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2568        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2569        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2570                     dst1, dst2, dst3);
2571
2572        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2573                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2574
2575        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2576        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2577        dst += (4 * dst_stride);
2578    }
2579}
2580
2581static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2582                                 int32_t src_stride,
2583                                 int16_t *src1_ptr,
2584                                 int32_t src2_stride,
2585                                 uint8_t *dst,
2586                                 int32_t dst_stride,
2587                                 const int8_t *filter,
2588                                 int32_t height)
2589{
2590    if (2 == height) {
2591        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2592                              dst, dst_stride, filter, height);
2593    } else if (6 == height) {
2594        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2595                              dst, dst_stride, filter, height);
2596    } else if (0 == (height % 4)) {
2597        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2598                                      src1_ptr, src2_stride,
2599                                      dst, dst_stride, filter, height);
2600    }
2601}
2602
/* Horizontal bi-predictive 4-tap filter for 12-wide blocks: four rows
 * per loop iteration, handled as an 8-wide part (mask0/mask1) plus a
 * 4-wide part taken from the upper half of each row pair via
 * mask2/mask3. Height is assumed to be a multiple of 4. */
static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    /* 8-width shuffle pattern for pixels 0..7 */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* shuffle picking pixels 8..11 from two source rows at once */
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        /* pack the four 4-wide intermediate tails into two vectors */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        /* 8-wide part as doublewords, 4-wide tail as words at dst + 8 */
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
2676
/* Horizontal bi-predictive 4-tap filter for 16-wide blocks: two rows
 * per loop iteration, each row split into two overlapping 8-pixel
 * halves (loads at +0 and +8). Height is assumed to be even. */
static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
    v8i16 filt0, filt1;
    /* 8-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* left halves in src0/src2, right halves in src1/src3 */
        LD_SB2(src0_ptr, src_stride, src0, src2);
        LD_SB2(src0_ptr + 8, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        src1_ptr += (2 * src2_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
2736
/* Horizontal bi-predictive 4-tap filter for 24-wide blocks: four rows
 * per loop iteration, split into a 16-wide left part (written via dst)
 * and an 8-wide right part (written via dst_tmp = dst + 16). The right
 * part reuses the src1/src3/src5/src7 loads made at column offset 16.
 * Height is assumed to be a multiple of 4. */
static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    /* 8-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* mask2/mask3 select pixels straddling the 16-byte vector boundary */
    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    /* separate cursors for the rightmost 8 columns */
    dst_tmp = dst + 16;
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even regs: columns 0..15; odd regs: columns 16..31 */
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* 16-wide left part, rows 0-1 */
        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        /* 16-wide left part, rows 2-3 */
        dst4 = const_vec;
        dst5 = const_vec;
        dst6 = const_vec;
        dst7 = const_vec;
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
                     dst5, dst6, dst7);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
                     dst5, dst6, dst7);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* 8-wide right part for the same four rows */
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }
}
2842
/* Horizontal bi-predictive 4-tap filter for 32-wide blocks: one full
 * 32-pixel row per loop iteration. The row is covered by three 16-byte
 * loads (offsets 0, 16 and 24); mask2/mask3 shuffle across the
 * src0/src1 boundary. */
static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    /* 8-width shuffle pattern */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    /* one pixel of left context for the 4-tap filter */
    src0_ptr -= 1;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* mask2/mask3 select pixels straddling the 16-byte vector boundary */
    mask1 = mask0 + 2;
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        /* overlapping load covering the last 8 output pixels + context */
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
        dst += dst_stride;
    }
}
2903
/* Vertical bi-predictive 4-tap filter for a 4x2 block.
 * Reads 3 context rows plus 2 new rows from src0_ptr, filters along the
 * vertical axis, adds the 16-bit intermediate rows from src1_ptr,
 * rounds by 7 bits, clips to [0, 255] and stores two 32-bit words.
 * 'height' is unused: this path handles exactly 2 rows. */
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v8i16 dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one row of context above for the 4-tap vertical filter */
    src0_ptr -= src_stride;

    /* offset cancelling the -128 XOR bias (taps assumed to sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    /* interleave adjacent rows so a dot-product spans two taps at once */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);
    CLIP_SH_0_255(dst10);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    ST_W2(dst10, 0, 1, dst, dst_stride);
}
2951
/* Vertical 4-tap bi-predictive interpolation for a 4x4 block:
 * filter 8-bit pixels from src0_ptr vertically, add the 16-bit
 * intermediates from src1_ptr, round (>> 7 with rounding), clip to
 * [0, 255] and store.  'height' is unused: always four rows. */
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows; pack both row pairs per vector */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    /* pack the four 4-sample intermediate rows into two vectors */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    XORI_B2_128_SB(src4332, src6554);  /* to signed range */

    /* two output vectors, each covering two rows */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);

    /* pack to bytes and store four 4-byte rows */
    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
}
3000
/* Vertical 4-tap bi-predictive interpolation for 4-wide blocks whose
 * height is a multiple of 8: per iteration, filter eight rows of 8-bit
 * pixels from src0_ptr, add the 16-bit intermediates from src1_ptr,
 * round (>> 7 with rounding), clip to [0, 255] and store. */
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows; pack both row pairs per vector */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    /* eight output rows per iteration */
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        /* pack the eight 4-sample intermediate rows into four vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);  /* to signed range */

        /* each dst vector covers two output rows */
        dst10 = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        /* last two rows of this group, reusing src2110 as loop carry */
        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        dst76 = const_vec;
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        /* pack to bytes and store eight 4-byte rows */
        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
3073
3074static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
3075                                 int32_t src_stride,
3076                                 int16_t *src1_ptr,
3077                                 int32_t src2_stride,
3078                                 uint8_t *dst,
3079                                 int32_t dst_stride,
3080                                 const int8_t *filter,
3081                                 int32_t height)
3082{
3083    if (2 == height) {
3084        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3085                              dst, dst_stride, filter, height);
3086    } else if (4 == height) {
3087        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3088                              dst, dst_stride, filter, height);
3089    } else {
3090        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
3091                                      src1_ptr, src2_stride,
3092                                      dst, dst_stride, filter, height);
3093    }
3094}
3095
/* Vertical 4-tap bi-predictive interpolation, 6 pixels wide: filter
 * 8-bit pixels from src0_ptr vertically, add the 16-bit intermediates
 * from src1_ptr, round (>> 7 with rounding), clip to [0, 255] and
 * store each row as a 4-byte word plus a 2-byte halfword.
 * 'height' is unused: this path always produces eight rows. */
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* load all eleven source rows needed for eight outputs up front */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src5, src6);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src7, src8);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src9, src10);
    src0_ptr += (2 * src_stride);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);

    /* to signed range for the signed-byte dot products */
    XORI_B3_128_SB(src0, src1, src2);
    XORI_B2_128_SB(src3, src4);
    XORI_B2_128_SB(src5, src6);
    XORI_B2_128_SB(src7, src8);
    XORI_B2_128_SB(src9, src10);

    /* interleave vertically adjacent rows for the dot products */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    /* rows 0..3 */
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    /* store rows 0..3: 4 bytes (word) + 2 bytes (halfword) per row */
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    /* rows 4..7, reusing the register names for the new interleaves */
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);

    dst0_r = const_vec;
    DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);

    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    /* store rows 4..7 in the same 4+2 byte pattern */
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);
}
3193
/* Vertical 4-tap bi-predictive interpolation for an 8x2 block:
 * filter 8-bit pixels from src0_ptr vertically, add the 16-bit
 * intermediates from src1_ptr, round (>> 7 with rounding), clip to
 * [0, 255] and store.  'height' is unused: always two rows. */
static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst0_r, dst1_r;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows (converted to signed range) */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    /* one dot-product accumulation per output row */
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);

    /* store two 8-byte rows */
    ST_D2(dst0_r, 0, 1, dst, dst_stride);
}
3237
/* Vertical 4-tap bi-predictive interpolation for an 8x6 block:
 * filter 8-bit pixels from src0_ptr vertically, add the 16-bit
 * intermediates from src1_ptr, round (>> 7 with rounding), clip to
 * [0, 255] and store.  'height' is unused: always six rows. */
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows (converted to signed range) */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* remaining six source rows and all six intermediate rows */
    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* one dot-product accumulation per output row (six rows) */
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    dst2_r = const_vec;
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    dst3_r = const_vec;
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    dst4_r = const_vec;
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
    dst5_r = const_vec;
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);

    /* pack to bytes and store six 8-byte rows */
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
    ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
}
3297
/* Vertical 4-tap bi-predictive interpolation for 8-wide blocks whose
 * height is a multiple of 4: per iteration, filter four rows of 8-bit
 * pixels from src0_ptr, add the 16-bit intermediates from src1_ptr,
 * round (>> 7 with rounding), clip to [0, 255] and store. */
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter,
                                          int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows (converted to signed range) */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* four output rows per iteration */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        /* next two rows; src2 is reloaded here so the interleaves of the
           next iteration start from the correct row */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        /* src10_r/src21_r double as the loop-carried interleaves */
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        /* pack to bytes and store four 8-byte rows */
        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3359
3360static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
3361                                 int32_t src_stride,
3362                                 int16_t *src1_ptr,
3363                                 int32_t src2_stride,
3364                                 uint8_t *dst,
3365                                 int32_t dst_stride,
3366                                 const int8_t *filter,
3367                                 int32_t height)
3368{
3369    if (2 == height) {
3370        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3371                              dst, dst_stride, filter, height);
3372    } else if (6 == height) {
3373        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3374                              dst, dst_stride, filter, height);
3375    } else {
3376        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3377                                      src1_ptr, src2_stride,
3378                                      dst, dst_stride, filter, height);
3379    }
3380}
3381
/* Vertical 4-tap bi-predictive interpolation, 12 pixels wide, height a
 * multiple of 4.  Columns 0..7 are processed through the right (ILVR)
 * interleaves; columns 8..11 come from the left (ILVL) halves, packed
 * pairwise so one vector covers two rows.  Filtered pixels are added
 * to the 16-bit intermediates from src1_ptr, rounded (>> 7 with
 * rounding), clipped to [0, 255] and stored. */
static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= (1 * src_stride);

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows; pack the left-half interleaves
       of both row pairs into one vector for the 4 rightmost columns */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    /* four output rows per iteration */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SB2(src0_ptr, src_stride, src5, src6);
        src0_ptr += (2 * src_stride);
        /* intermediates: 8 left samples per row, plus 4 more at offset 8 */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);
        XORI_B2_128_SB(src5, src6);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        /* dst*_r: columns 0..7; dst*_l: columns 8..11 (two rows each) */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        /* store 8 bytes per row, then the remaining 4 bytes at offset 8 */
        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        /* carry the last interleaves into the next iteration */
        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src2110 = src6554;
    }
}
3464
/* Vertical 4-tap bi-predictive interpolation, 16 pixels wide, height a
 * multiple of 4.  Columns 0..7 use the right (ILVR) interleaves and
 * columns 8..15 the left (ILVL) ones; filtered pixels are added to the
 * 16-bit intermediates from src1_ptr, rounded (>> 7 with rounding),
 * clipped to [0, 255] and stored two rows per half-iteration. */
static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with the first three rows (both halves interleaved) */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* four output rows per iteration, processed as two row pairs */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* intermediates: left 8 samples per row, plus 8 more at offset 8 */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* dst*_r: columns 0..7; dst*_l: columns 8..15 */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* second row pair; src2 is reloaded so the next iteration's
           interleaves start from the correct row */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        /* src10/src21 double as the loop-carried interleaves */
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
3549
/* Vertical 4-tap bi-predictive interpolation, 24 pixels wide, height a
 * multiple of 4.  Handled as a 16-wide part (ILVR + ILVL interleaves)
 * plus an 8-wide part at byte offset 16 (right interleaves only).
 * Filtered pixels are added to the 16-bit intermediates from src1_ptr,
 * rounded (>> 7 with rounding), clipped to [0, 255] and stored. */
static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* one leading context row for the 4-tap filter */
    src0_ptr -= src_stride;

    /* bias cancelling the XOR-by-128 applied to the source bytes */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* two 16-bit coefficient pairs of the 4-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* 8width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* four output rows per iteration, processed as two row pairs */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        /* intermediates at offsets 0, 8 and 16 within each row */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* 8width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        /* 16width */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        /* store 16 bytes per row, plus 8 bytes at offset 16 */
        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* 16width: second row pair; src2/src8 reloaded as loop carries */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        /* 16width */
        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        /* 8width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
3673
/* Vertical 4-tap interpolation with bi-prediction for 32-pixel-wide blocks.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels (this direction's source)
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output
 * filter               : 4-tap vertical filter coefficients
 * height               : block height; two rows are produced per iteration
 *
 * The 32 columns are processed as two independent 16-wide strips: the left
 * strip is stored through 'dst', the right strip through 'dst_tmp'.
 */
static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;  /* output pointer for the right 16 columns */
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    /* back up one row so the 4-tap window is centred on the output row */
    src0_ptr -= src_stride;

    /* 128 << 6: offset that cancels the -128 sign bias introduced by the
     * XORI_B*_128 conversions below, pre-scaled by the 6-bit filter shift */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    /* prologue: load the first 3 rows and build the interleaved row pairs
     * (right/left halves) that seed the vertical filter pipeline */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* next 16width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        /* load the 16-bit other-direction samples for both rows, 8 columns
         * per vector, covering all 32 columns */
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 16width */
        /* vertical 4-tap filter: two dot-product accumulations per output
         * vector, starting from the bias constant */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* 16width */
        /* add the other direction, round (>> 7 with rounding) and clip */
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        /* slide the vertical filter window down two rows */
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        /* next 16width */
        /* same two-row processing for the right 16-column strip */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        /* next 16width */
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
        /* next 16width */
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
                          dst2_r, dst3_r, dst2_l, dst3_l);

        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* slide the right strip's filter window down two rows */
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
3783
/* Horizontal+vertical (2-D) 4-tap interpolation with bi-prediction for a
 * 4x2 block.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output (two 4-byte rows)
 * filter_x / filter_y  : 4-tap horizontal / vertical coefficients
 *
 * Pipeline: horizontal 4-tap on 5 input rows -> 16-bit, then vertical
 * 4-tap on the interleaved results -> 32-bit, >> 6, add the other
 * direction, round by 7, clip to [0,255] and pack.
 */
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    uint64_t tp0, tp1;
    v16u8 out;
    v8i16 in0 = { 0 };
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* second mask-table entry: shuffle pattern for the 4-wide case that
     * pairs two source rows in one vector */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
    v4i32 dst0, dst1;

    /* back up one row and one column to centre the 4-tap windows */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* vertical taps are widened to 16 bit and splatted as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6: cancels the -128 sign bias applied by XORI_B5_128_SB,
     * pre-scaled by the 6-bit filter shift */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* two rows x 4 samples of the other direction, 64 bits each */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    in0 = __msa_adds_s_h(in0, const_vec);

    /* horizontal filter: each vector holds two source rows (e.g. 0 and 2) */
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    /* vertical filter on the interleaved horizontal results */
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_adds_s_h(tmp, in0);
    tmp = __msa_srari_h(tmp, 7);
    CLIP_SH_0_255(tmp);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}
3850
/* Horizontal+vertical (2-D) 4-tap interpolation with bi-prediction for a
 * 4x4 block.  Same pipeline as the 4x2 variant, extended to 7 input rows
 * and four output rows.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output (four 4-byte rows)
 * filter_x / filter_y  : 4-tap horizontal / vertical coefficients
 */
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* 4-wide shuffle table entry: pairs two source rows per vector */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    /* back up one row and one column to centre the 4-tap windows */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* 128 << 6: cancels the -128 sign bias from XORI_B7_128_SB */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* four rows x 4 samples of the other direction, two rows per vector */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    src1_ptr += 2 * src2_stride;
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);

    /* horizontal filter: each vector combines rows 3 apart (0&3, 1&4, ...) */
    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    /* interleave consecutive-row pairs for the vertical filter */
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    /* add the other direction, round by 7, clip and pack to bytes */
    ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 7);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
3926
/* Horizontal+vertical (2-D) 4-tap interpolation with bi-prediction for
 * 4-wide blocks whose height is a multiple of 8.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output
 * filter_x / filter_y  : 4-tap horizontal / vertical coefficients
 * height               : block height, assumed to be a multiple of 8
 *
 * A 3-row prologue primes the vertical filter; the loop then produces
 * 8 output rows per iteration, carrying the last interleaved pairs over.
 */
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* 4-wide shuffle table entry: pairs two source rows per vector */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;

    /* back up one row and one column to centre the 4-tap windows */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6: cancels the -128 sign bias from the XORI conversions */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* prologue: horizontally filter the first 3 rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* keep row 2's horizontal result for the first in-loop interleave */
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);


    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* horizontal filter: each vector combines rows 4 apart (3&7, ...) */
        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* build all consecutive-row interleaves needed by the vertical taps */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* eight rows x 4 samples of the other direction, two rows per load */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);
        /* vertical 4-tap filter producing 8 output rows */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
        /* add the other direction, round by 7, clip and pack to bytes */
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
                    tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* carry the last two interleaved pairs into the next iteration */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
4045
/* 2-D 4-tap bi-predictive interpolation, 4-pixel width: dispatch to the
 * kernel specialised for the requested block height. Heights other than
 * 2, 4 or a multiple of 8 are not produced by the HEVC decoder and fall
 * through without output. */
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    switch (height) {
    case 2:
        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
        break;
    case 4:
        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
        break;
    default:
        if (0 == (height % 8)) {
            hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
                                          src1_ptr, src2_stride,
                                          dst, dst_stride,
                                          filter_x, filter_y, height);
        }
        break;
    }
}
4069
/* Horizontal+vertical (2-D) 4-tap interpolation with bi-prediction for a
 * 6-pixel-wide, 8-row block.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output
 * filter_x / filter_y  : 4-tap horizontal / vertical coefficients
 * height               : unused here; the function always emits 8 rows
 *                        (NOTE(review): callers presumably only invoke it
 *                        with height == 8 — confirm at the call site)
 *
 * The 6 columns are split as 4 + 2: the left 4 columns come from the
 * right-interleaved (_r) halves and are stored with 32-bit stores, the
 * remaining 2 columns come from the left-interleaved (_l) halves and are
 * stored with 16-bit stores at dst + 4.
 */
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t tpw0, tpw1, tpw2, tpw3;
    uint64_t tp0, tp1;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* first mask-table entry: shuffle pattern for >= 8-wide filtering */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };

    /* back up one row and one column to centre the 4-tap windows */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6: cancels the -128 sign bias from the XORI conversions */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* prologue: horizontally filter the first 3 rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    /* horizontally filter the remaining 8 rows */
    LD_SB8(src0_ptr, src_stride,
           src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    /* interleave consecutive rows; _r feeds columns 0-3, _l columns 4-5 */
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    /* pack the narrow _l halves two row-pairs per vector */
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    /* vertical 4-tap filter: 8 rows for columns 0-3, 4 paired vectors for
     * columns 4-5 */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);

    /* other-direction samples for columns 0-3 (64-bit loads, 2 rows each) */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in2);
    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in3);

    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
                in0, in1, in2, in3);
    /* bi-prediction average, round by 7, clip and store columns 0-3 */
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
                tmp3);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

    /* other-direction samples for columns 4-5 (32-bit loads at offset 4) */
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
    SRARI_H2_SH(tmp4, tmp5, 7);
    CLIP_SH2_0_255(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    /* 16-bit stores cover the remaining 2 columns of all 8 rows */
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
4219
/* Horizontal+vertical (2-D) 4-tap interpolation with bi-prediction for an
 * 8x2 block.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output (two 8-byte rows)
 * filter_x / filter_y  : 4-tap horizontal / vertical coefficients
 *
 * Each of the 5 input rows is filtered horizontally on its own; the
 * interleaved low/high halves then feed the vertical 4-tap filter.
 */
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    /* first mask-table entry: shuffle pattern for 8-wide filtering */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1;
    v8i16 in0, in1;

    /* back up one row and one column to centre the 4-tap windows */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6: cancels the -128 sign bias from XORI_B5_128_SB */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* two rows x 8 samples of the other direction */
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);

    /* horizontal 4-tap filter, one source row per result vector */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);

    /* vertical 4-tap filter on interleaved consecutive-row pairs */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    /* bi-prediction average, round by 7, clip and pack to bytes */
    ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 7);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}
4293
/* Horizontal+vertical (2-D) 4-tap interpolation with bi-prediction for
 * blocks 4 rows high and (8 * width8mult) pixels wide.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels
 * src1_ptr/src2_stride : 16-bit intermediate samples from the other
 *                        prediction direction
 * dst/dst_stride       : 8-bit averaged output
 * filter_x / filter_y  : 4-tap horizontal / vertical coefficients
 * width8mult           : number of 8-column strips to process
 *
 * Each loop iteration handles one full 8-wide, 4-row strip and then steps
 * all pointers 8 columns to the right.
 */
static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
                                      int32_t src_stride,
                                      int16_t *src1_ptr,
                                      int32_t src2_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t width8mult)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 in0, in1, in2, in3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    /* back up one row and one column to centre the 4-tap windows */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* first mask-table entry: shuffle pattern for 8-wide filtering */
    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    /* 128 << 6: cancels the -128 sign bias from the XORI conversions */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        /* 7 input rows yield 4 output rows through the vertical 4-tap */
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src0_ptr += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* four rows x 8 samples of the other direction for this strip */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += 8;
        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        /* horizontal 4-tap filter, rows 0-2 */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        /* horizontal 4-tap filter, rows 3-6 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

        /* vertical 4-tap filter on the interleaved row pairs */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        /* bi-prediction average, round by 7, clip and pack to bytes */
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
4389
/* HEVC HV (2-D separable) bi-prediction, 4-tap filters, one 8x6 block.
 *
 * src0_ptr/src_stride   : 8-bit reference pixels.
 * src1_ptr/src2_stride  : 16-bit intermediate samples (second prediction
 *                         source) added in before the final rounding.
 * dst/dst_stride        : 8-bit output block.
 * filter_x, filter_y    : 4-tap horizontal / vertical filter coefficients.
 *
 * Nine input rows are filtered horizontally; the resulting 16-bit rows are
 * filtered vertically to produce six output rows, which are combined with
 * the src1_ptr samples, rounded (shift by 7) and clipped to 0..255.
 */
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    /* Step back one row and one column so the 4-tap window is centred. */
    src0_ptr -= (src_stride + 1);

    /* Horizontal taps, splatted from the first two 16-bit coefficients. */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Vertical taps: sign-extend int8 coeffs to int16, splat as pairs. */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6 = 8192, folded into the src1_ptr operands below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* Nine source rows are needed for six vertically filtered rows. */
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);

    /* XOR with 128 flips the sign bit: unsigned pixels -> signed range. */
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    /* Six rows of the 16-bit second source, pre-biased by const_vec. */
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
                in0, in1, in2, in3);
    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);

    /* Horizontal 4-tap pass over all nine rows. */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    /* Interleave vertically adjacent rows for the vertical pass. */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    /* Vertical 4-tap pass: six output rows, right/left halves each. */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    /* >> 6, pack to 16 bits, add second source, round by 7, clip, store. */
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                tmp0, tmp1, tmp2, tmp3);
    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    SRARI_H2_SH(tmp4, tmp5, 7);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
4505
/* HEVC HV (2-D separable) bi-prediction, 4-tap filters, generic loop for
 * blocks that are a multiple of 8 wide and a multiple of 4 high.
 *
 * src0_ptr/src_stride   : 8-bit reference pixels.
 * src1_ptr/src2_stride  : 16-bit intermediate samples (second prediction
 *                         source) added in before the final rounding.
 * dst/dst_stride        : 8-bit output.
 * filter_x, filter_y    : 4-tap horizontal / vertical coefficients.
 * height, width         : block size; width is consumed 8 pixels at a time.
 *
 * For each 8-wide column strip the first three rows are filtered
 * horizontally once; the inner loop then emits 4 output rows per
 * iteration, carrying the last three horizontally filtered rows over
 * (the dst10/dst21/dst2 rotation at the loop tail).
 */
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height,
                                          int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;

    /* Step back one row and one column for the 4-tap window. */
    src0_ptr -= (src_stride + 1);

    /* Horizontal taps, splatted from the first two 16-bit coefficients. */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Vertical taps: sign-extend int8 coeffs to int16, splat as pairs. */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6 = 8192, folded into the src1_ptr operands below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* One pass per 8-pixel wide column strip. */
    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        /* Prologue: horizontally filter the first three rows. */
        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        /* Main loop: 4 output rows per iteration. */
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            /* Pre-bias the 16-bit second source rows. */
            ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                        const_vec, in0, in1, in2, in3);

            /* Horizontal 4-tap pass for the four new rows. */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            /* Vertical 4-tap pass. */
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            /* >> 6, pack, add second source, round by 7, clip, store. */
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                        tmp0, tmp1, tmp2, tmp3);
            SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* Rotate the vertical-filter context for the next 4 rows. */
            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        /* Advance all three streams to the next 8-pixel column. */
        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
4630
4631static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
4632                                 int32_t src_stride,
4633                                 int16_t *src1_ptr,
4634                                 int32_t src2_stride,
4635                                 uint8_t *dst,
4636                                 int32_t dst_stride,
4637                                 const int8_t *filter_x,
4638                                 const int8_t *filter_y,
4639                                 int32_t height)
4640{
4641    if (2 == height) {
4642        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4643                              dst, dst_stride, filter_x, filter_y);
4644    } else if (4 == height) {
4645        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4646                                  dst, dst_stride, filter_x, filter_y, 1);
4647    } else if (6 == height) {
4648        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4649                              dst, dst_stride, filter_x, filter_y);
4650    } else {
4651        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
4652                                      src1_ptr, src2_stride,
4653                                      dst, dst_stride,
4654                                      filter_x, filter_y, height, 8);
4655    }
4656}
4657
/* HEVC HV (2-D separable) bi-prediction, 4-tap filters, 12-wide blocks.
 *
 * The block is split into two strips:
 *  - an 8-wide strip processed 4 rows per iteration (4 iterations);
 *  - a 4-wide strip where mask2/mask3 pick pixels from a pair of source
 *    rows so two rows share one vector, processed 8 rows per iteration
 *    (2 iterations).
 *
 * NOTE(review): the height argument is never referenced; the loop counts
 * hardwire 16 output rows — confirm callers only use height 16.
 */
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    /* Step back one row and one column for the 4-tap window. */
    src0_ptr -= (src_stride + 1);

    /* Horizontal taps, splatted from the first two 16-bit coefficients. */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Vertical taps: sign-extend int8 coeffs to int16, splat as pairs. */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    /* 128 << 6 = 8192, folded into the src1_ptr operands below. */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* ---- Left 8-wide strip. ---- */
    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    /* Prologue: horizontally filter the first three rows. */
    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    /* 4 iterations x 4 rows = 16 output rows for the 8-wide strip. */
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        /* Pre-bias the 16-bit second source rows. */
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        /* Horizontal 4-tap pass for the four new rows. */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        /* Vertical 4-tap pass. */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        /* >> 6, pack, add second source, round by 7, clip, store. */
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        /* Rotate the vertical-filter context for the next 4 rows. */
        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    /* ---- Right 4-wide strip (columns 8..11). ---- */
    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;

    /* mask2/mask3 gather 4-pixel windows from a pair of source rows. */
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    /* 2 iterations x 8 rows = 16 output rows for the 4-wide strip. */
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* Each shuffle combines two rows four apart (e.g. rows 3 and 7). */
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* Build the interleaved row pairs for the vertical pass. */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* Gather 8 rows of the 4-wide 16-bit second source, 2 per vector. */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        /* Vertical 4-tap pass: eight 4-pixel output rows. */
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);

        /* >> 6, pack, add second source, round by 7, clip, store 8x4. */
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* Rotate the vertical-filter context for the next 8 rows. */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
4859
4860static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
4861                                  int32_t src_stride,
4862                                  int16_t *src1_ptr,
4863                                  int32_t src2_stride,
4864                                  uint8_t *dst,
4865                                  int32_t dst_stride,
4866                                  const int8_t *filter_x,
4867                                  const int8_t *filter_y,
4868                                  int32_t height)
4869{
4870    if (4 == height) {
4871        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4872                                  dst, dst_stride, filter_x, filter_y, 2);
4873    } else {
4874        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
4875                                      src2_stride, dst, dst_stride, filter_x,
4876                                      filter_y, height, 16);
4877    }
4878}
4879
4880static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
4881                                  int32_t src_stride,
4882                                  int16_t *src1_ptr,
4883                                  int32_t src2_stride,
4884                                  uint8_t *dst,
4885                                  int32_t dst_stride,
4886                                  const int8_t *filter_x,
4887                                  const int8_t *filter_y,
4888                                  int32_t height)
4889{
4890    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4891                                  dst, dst_stride, filter_x, filter_y,
4892                                  height, 24);
4893}
4894
4895static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
4896                                  int32_t src_stride,
4897                                  int16_t *src1_ptr,
4898                                  int32_t src2_stride,
4899                                  uint8_t *dst,
4900                                  int32_t dst_stride,
4901                                  const int8_t *filter_x,
4902                                  const int8_t *filter_y,
4903                                  int32_t height)
4904{
4905    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4906                                  dst, dst_stride, filter_x, filter_y,
4907                                  height, 32);
4908}
4909
/* BI_MC_COPY(WIDTH) emits the public full-pel bi-prediction entry point
 * ff_hevc_put_hevc_bi_pel_pixels<WIDTH>_8_msa().  It forwards to the
 * width-specific hevc_bi_copy_<WIDTH>w_msa() helper, passing MAX_PB_SIZE
 * as the stride of the 16-bit second prediction source (src_16bit).
 * The mx/my/width arguments are part of the common mc prototype and are
 * unused here. */
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}

/* One entry point per supported block width. */
BI_MC_COPY(4);
BI_MC_COPY(6);
BI_MC_COPY(8);
BI_MC_COPY(12);
BI_MC_COPY(16);
BI_MC_COPY(24);
BI_MC_COPY(32);
BI_MC_COPY(48);
BI_MC_COPY(64);

#undef BI_MC_COPY
4936
/* BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) emits the public 1-D
 * bi-prediction interpolation entry point
 * ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa().
 * The filter coefficients are selected from ff_hevc_<PEL>_filters[] using
 * the 1-based fractional offset named by FILT_DIR (mx for horizontal, my
 * for vertical), and the call is forwarded to the width-specific
 * hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa() kernel with MAX_PB_SIZE as the
 * stride of the 16-bit second prediction source. */
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int16_t *src_16bit,    \
                                                      int height,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,     \
                                             MAX_PB_SIZE, dst, dst_stride,   \
                                             filter, height);                \
}

/* Luma (qpel, 8-tap) horizontal entry points. */
BI_MC(qpel, h, 4, 8, hz, mx);
BI_MC(qpel, h, 8, 8, hz, mx);
BI_MC(qpel, h, 12, 8, hz, mx);
BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);

/* Luma (qpel, 8-tap) vertical entry points. */
BI_MC(qpel, v, 4, 8, vt, my);
BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 12, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);

/* Chroma (epel, 4-tap) horizontal entry points. */
BI_MC(epel, h, 4, 4, hz, mx);
BI_MC(epel, h, 8, 4, hz, mx);
BI_MC(epel, h, 6, 4, hz, mx);
BI_MC(epel, h, 12, 4, hz, mx);
BI_MC(epel, h, 16, 4, hz, mx);
BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);

/* Chroma (epel, 4-tap) vertical entry points. */
BI_MC(epel, v, 4, 4, vt, my);
BI_MC(epel, v, 8, 4, vt, my);
BI_MC(epel, v, 6, 4, vt, my);
BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);

#undef BI_MC
4990
/* BI_MC_HV(PEL, WIDTH, TAP) emits the public 2-D (horizontal + vertical)
 * bi-prediction interpolation entry point
 * ff_hevc_put_hevc_bi_<PEL>_hv<WIDTH>_8_msa().
 * Both filter tables are selected from ff_hevc_<PEL>_filters[] using the
 * 1-based fractional offsets mx (horizontal) and my (vertical), and the
 * call is forwarded to hevc_hv_bi_<TAP>t_<WIDTH>w_msa() with MAX_PB_SIZE
 * as the stride of the 16-bit second prediction source. */
#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
                                       MAX_PB_SIZE, dst, dst_stride,      \
                                       filter_x, filter_y, height);       \
}

/* Luma (qpel, 8-tap) 2-D entry points. */
BI_MC_HV(qpel, 4, 8);
BI_MC_HV(qpel, 8, 8);
BI_MC_HV(qpel, 12, 8);
BI_MC_HV(qpel, 16, 8);
BI_MC_HV(qpel, 24, 8);
BI_MC_HV(qpel, 32, 8);
BI_MC_HV(qpel, 48, 8);
BI_MC_HV(qpel, 64, 8);

/* Chroma (epel, 4-tap) 2-D entry points. */
BI_MC_HV(epel, 4, 4);
BI_MC_HV(epel, 8, 4);
BI_MC_HV(epel, 6, 4);
BI_MC_HV(epel, 12, 4);
BI_MC_HV(epel, 16, 4);
BI_MC_HV(epel, 24, 4);
BI_MC_HV(epel, 32, 4);

#undef BI_MC_HV
5028