1/*
2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "libavcodec/mips/hevcdsp_mips.h"
23#include "libavcodec/mips/hevc_macros_msa.h"
24
25static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
26    /* 8 width cases */
27    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28    /* 4 width cases */
29    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
30};
31
32static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
33                             int16_t *dst, int32_t dst_stride,
34                             int32_t height)
35{
36    v16i8 zero = { 0 };
37
38    if (2 == height) {
39        v16i8 src0, src1;
40        v8i16 in0;
41
42        LD_SB2(src, src_stride, src0, src1);
43
44        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
45        in0 = (v8i16) __msa_ilvr_b(zero, src0);
46        in0 <<= 6;
47        ST_D2(in0, 0, 1, dst, dst_stride);
48    } else if (4 == height) {
49        v16i8 src0, src1, src2, src3;
50        v8i16 in0, in1;
51
52        LD_SB4(src, src_stride, src0, src1, src2, src3);
53
54        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
55        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
56        in0 <<= 6;
57        in1 <<= 6;
58        ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride);
59    } else if (0 == height % 8) {
60        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
61        v8i16 in0, in1, in2, in3;
62        uint32_t loop_cnt;
63
64        for (loop_cnt = (height >> 3); loop_cnt--;) {
65            LD_SB8(src, src_stride,
66                   src0, src1, src2, src3, src4, src5, src6, src7);
67            src += (8 * src_stride);
68
69            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
70                       src0, src1, src2, src3);
71            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
72                       in0, in1, in2, in3);
73            SLLI_4V(in0, in1, in2, in3, 6);
74            ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
75            dst += (8 * dst_stride);
76        }
77    }
78}
79
80static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
81                             int16_t *dst, int32_t dst_stride,
82                             int32_t height)
83{
84    uint32_t loop_cnt;
85    v16i8 zero = { 0 };
86    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
87    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
88
89    for (loop_cnt = (height >> 3); loop_cnt--;) {
90        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
91        src += (8 * src_stride);
92
93        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
94                   in0, in1, in2, in3);
95        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
96                   in4, in5, in6, in7);
97        SLLI_4V(in0, in1, in2, in3, 6);
98        SLLI_4V(in4, in5, in6, in7, 6);
99        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
100        dst += (8 * dst_stride);
101    }
102}
103
104static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
105                             int16_t *dst, int32_t dst_stride,
106                             int32_t height)
107{
108    v16i8 zero = { 0 };
109
110    if (2 == height) {
111        v16i8 src0, src1;
112        v8i16 in0, in1;
113
114        LD_SB2(src, src_stride, src0, src1);
115
116        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
117        in0 <<= 6;
118        in1 <<= 6;
119        ST_SH2(in0, in1, dst, dst_stride);
120    } else if (4 == height) {
121        v16i8 src0, src1, src2, src3;
122        v8i16 in0, in1, in2, in3;
123
124        LD_SB4(src, src_stride, src0, src1, src2, src3);
125
126        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
127                   in0, in1, in2, in3);
128        SLLI_4V(in0, in1, in2, in3, 6);
129        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
130    } else if (6 == height) {
131        v16i8 src0, src1, src2, src3, src4, src5;
132        v8i16 in0, in1, in2, in3, in4, in5;
133
134        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
135
136        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
137                   in0, in1, in2, in3);
138        ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
139        SLLI_4V(in0, in1, in2, in3, 6);
140        in4 <<= 6;
141        in5 <<= 6;
142        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
143    } else if (0 == height % 8) {
144        uint32_t loop_cnt;
145        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
146        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
147
148        for (loop_cnt = (height >> 3); loop_cnt--;) {
149            LD_SB8(src, src_stride,
150                   src0, src1, src2, src3, src4, src5, src6, src7);
151            src += (8 * src_stride);
152
153            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
154                       in0, in1, in2, in3);
155            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
156                       in4, in5, in6, in7);
157            SLLI_4V(in0, in1, in2, in3, 6);
158            SLLI_4V(in4, in5, in6, in7, 6);
159            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
160            dst += (8 * dst_stride);
161        }
162    }
163}
164
165static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
166                              int16_t *dst, int32_t dst_stride,
167                              int32_t height)
168{
169    uint32_t loop_cnt;
170    v16i8 zero = { 0 };
171    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
172    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
173
174    for (loop_cnt = (height >> 3); loop_cnt--;) {
175        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
176        src += (8 * src_stride);
177
178        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
179                   in0_r, in1_r, in2_r, in3_r);
180        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
181        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
182        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
183        in0 <<= 6;
184        in1 <<= 6;
185        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
186        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
187        dst += (4 * dst_stride);
188
189        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
190                   in0_r, in1_r, in2_r, in3_r);
191        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
192        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
193        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
194        in0 <<= 6;
195        in1 <<= 6;
196        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
197        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
198        dst += (4 * dst_stride);
199    }
200}
201
202static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
203                              int16_t *dst, int32_t dst_stride,
204                              int32_t height)
205{
206    v16i8 zero = { 0 };
207
208    if (4 == height) {
209        v16i8 src0, src1, src2, src3;
210        v8i16 in0_r, in1_r, in2_r, in3_r;
211        v8i16 in0_l, in1_l, in2_l, in3_l;
212
213        LD_SB4(src, src_stride, src0, src1, src2, src3);
214
215        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
216                   in0_r, in1_r, in2_r, in3_r);
217        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
218                   in0_l, in1_l, in2_l, in3_l);
219        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
220        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
221        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
222        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
223    } else if (12 == height) {
224        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
225        v16i8 src8, src9, src10, src11;
226        v8i16 in0_r, in1_r, in2_r, in3_r;
227        v8i16 in0_l, in1_l, in2_l, in3_l;
228
229        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
230        src += (8 * src_stride);
231        LD_SB4(src, src_stride, src8, src9, src10, src11);
232
233        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
234                   in0_r, in1_r, in2_r, in3_r);
235        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
236                   in0_l, in1_l, in2_l, in3_l);
237        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
238        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
239        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
240        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
241        dst += (4 * dst_stride);
242
243        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
244                   in0_r, in1_r, in2_r, in3_r);
245        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
246                   in0_l, in1_l, in2_l, in3_l);
247        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
248        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
249        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
250        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
251        dst += (4 * dst_stride);
252
253        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
254                   in0_r, in1_r, in2_r, in3_r);
255        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
256                   in0_l, in1_l, in2_l, in3_l);
257        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
258        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
259        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
260        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
261    } else if (0 == (height % 8)) {
262        uint32_t loop_cnt;
263        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
264        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
265
266        for (loop_cnt = (height >> 3); loop_cnt--;) {
267            LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
268                   src7);
269            src += (8 * src_stride);
270            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
271                       in1_r, in2_r, in3_r);
272            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
273                       in1_l, in2_l, in3_l);
274            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
275            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
276            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
277            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
278            dst += (4 * dst_stride);
279
280            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
281                       in1_r, in2_r, in3_r);
282            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
283                       in1_l, in2_l, in3_l);
284            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
285            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
286            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
287            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
288            dst += (4 * dst_stride);
289        }
290    }
291}
292
293static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
294                              int16_t *dst, int32_t dst_stride,
295                              int32_t height)
296{
297    uint32_t loop_cnt;
298    v16i8 zero = { 0 };
299    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
300    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
301
302    for (loop_cnt = (height >> 2); loop_cnt--;) {
303        LD_SB4(src, src_stride, src0, src1, src2, src3);
304        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
305        src += (4 * src_stride);
306        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
307                   in2_r, in3_r);
308        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
309                   in2_l, in3_l);
310        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
311        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
312        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
313        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
314        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
315                   in2_r, in3_r);
316        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
317        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
318        dst += (4 * dst_stride);
319    }
320}
321
322static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
323                              int16_t *dst, int32_t dst_stride,
324                              int32_t height)
325{
326    uint32_t loop_cnt;
327    v16i8 zero = { 0 };
328    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
329    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
330
331    for (loop_cnt = (height >> 2); loop_cnt--;) {
332        LD_SB4(src, src_stride, src0, src2, src4, src6);
333        LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
334        src += (4 * src_stride);
335
336        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
337                   in2_r, in3_r);
338        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
339                   in2_l, in3_l);
340        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
341        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
342        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
343        dst += dst_stride;
344        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
345        dst += dst_stride;
346
347        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
348                   in2_r, in3_r);
349        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
350                   in2_l, in3_l);
351        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
352        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
353        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
354        dst += dst_stride;
355        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
356        dst += dst_stride;
357    }
358}
359
360static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
361                              int16_t *dst, int32_t dst_stride,
362                              int32_t height)
363{
364    uint32_t loop_cnt;
365    v16i8 zero = { 0 };
366    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
367    v16i8 src8, src9, src10, src11;
368    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
369    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
370
371    for (loop_cnt = (height >> 2); loop_cnt--;) {
372        LD_SB3(src, 16, src0, src1, src2);
373        src += src_stride;
374        LD_SB3(src, 16, src3, src4, src5);
375        src += src_stride;
376        LD_SB3(src, 16, src6, src7, src8);
377        src += src_stride;
378        LD_SB3(src, 16, src9, src10, src11);
379        src += src_stride;
380
381        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
382                   in0_r, in1_r, in2_r, in3_r);
383        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
384                   in0_l, in1_l, in2_l, in3_l);
385        ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
386        ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
387        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
388        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
389        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
390        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
391        dst += dst_stride;
392        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
393        dst += dst_stride;
394
395        ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
396                   in0_r, in1_r, in2_r, in3_r);
397        ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
398                   in0_l, in1_l, in2_l, in3_l);
399        ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
400        ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
401        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
402        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
403        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
404        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
405        dst += dst_stride;
406        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
407        dst += dst_stride;
408    }
409}
410
411static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
412                              int16_t *dst, int32_t dst_stride,
413                              int32_t height)
414{
415    uint32_t loop_cnt;
416    v16i8 zero = { 0 };
417    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
418    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
419
420    for (loop_cnt = (height >> 1); loop_cnt--;) {
421        LD_SB4(src, 16, src0, src1, src2, src3);
422        src += src_stride;
423        LD_SB4(src, 16, src4, src5, src6, src7);
424        src += src_stride;
425
426        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
427                   in0_r, in1_r, in2_r, in3_r);
428        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
429                   in0_l, in1_l, in2_l, in3_l);
430        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
431        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
432        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
433        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
434        dst += dst_stride;
435
436        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
437                   in0_r, in1_r, in2_r, in3_r);
438        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
439                   in0_l, in1_l, in2_l, in3_l);
440        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
441        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
442        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
443        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
444        dst += dst_stride;
445    }
446}
447
448static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
449                              int16_t *dst, int32_t dst_stride,
450                              const int8_t *filter, int32_t height)
451{
452    uint32_t loop_cnt;
453    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
454    v8i16 filt0, filt1, filt2, filt3;
455    v16i8 mask1, mask2, mask3;
456    v16i8 vec0, vec1, vec2, vec3;
457    v8i16 dst0, dst1, dst2, dst3;
458    v8i16 filter_vec, const_vec;
459    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
460
461    src -= 3;
462    const_vec = __msa_ldi_h(128);
463    const_vec <<= 6;
464
465    filter_vec = LD_SH(filter);
466    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
467
468    mask1 = mask0 + 2;
469    mask2 = mask0 + 4;
470    mask3 = mask0 + 6;
471
472    for (loop_cnt = (height >> 3); loop_cnt--;) {
473        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
474        src += (8 * src_stride);
475        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
476
477        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
478                   vec0, vec1, vec2, vec3);
479        dst0 = const_vec;
480        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
481                     dst0, dst0, dst0, dst0);
482        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
483                   vec0, vec1, vec2, vec3);
484        dst1 = const_vec;
485        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
486                     dst1, dst1, dst1, dst1);
487        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
488                   vec0, vec1, vec2, vec3);
489        dst2 = const_vec;
490        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
491                     dst2, dst2, dst2, dst2);
492        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
493                   vec0, vec1, vec2, vec3);
494        dst3 = const_vec;
495        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
496                     dst3, dst3, dst3, dst3);
497
498        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
499        dst += (8 * dst_stride);
500    }
501}
502
503static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
504                              int16_t *dst, int32_t dst_stride,
505                              const int8_t *filter, int32_t height)
506{
507    uint32_t loop_cnt;
508    v16i8 src0, src1, src2, src3;
509    v8i16 filt0, filt1, filt2, filt3;
510    v16i8 mask1, mask2, mask3;
511    v16i8 vec0, vec1, vec2, vec3;
512    v8i16 dst0, dst1, dst2, dst3;
513    v8i16 filter_vec, const_vec;
514    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
515
516    src -= 3;
517    const_vec = __msa_ldi_h(128);
518    const_vec <<= 6;
519
520    filter_vec = LD_SH(filter);
521    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
522
523    mask1 = mask0 + 2;
524    mask2 = mask0 + 4;
525    mask3 = mask0 + 6;
526
527    for (loop_cnt = (height >> 2); loop_cnt--;) {
528        LD_SB4(src, src_stride, src0, src1, src2, src3);
529        src += (4 * src_stride);
530        XORI_B4_128_SB(src0, src1, src2, src3);
531
532        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
533                   vec0, vec1, vec2, vec3);
534        dst0 = const_vec;
535        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
536                     dst0, dst0, dst0, dst0);
537        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
538                   vec0, vec1, vec2, vec3);
539        dst1 = const_vec;
540        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
541                     dst1, dst1, dst1, dst1);
542        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
543                   vec0, vec1, vec2, vec3);
544        dst2 = const_vec;
545        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
546                     dst2, dst2, dst2, dst2);
547        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
548                   vec0, vec1, vec2, vec3);
549        dst3 = const_vec;
550        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
551                     dst3, dst3, dst3, dst3);
552
553        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
554        dst += (4 * dst_stride);
555    }
556}
557
558static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
559                               int16_t *dst, int32_t dst_stride,
560                               const int8_t *filter, int32_t height)
561{
562    uint32_t loop_cnt;
563    int64_t res0, res1, res2, res3;
564    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
565    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
566    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
567    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
568    v8i16 filter_vec, const_vec;
569
570    src -= 3;
571    const_vec = __msa_ldi_h(128);
572    const_vec <<= 6;
573
574    filter_vec = LD_SH(filter);
575    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
576
577    mask0 = LD_SB(ff_hevc_mask_arr);
578    mask1 = mask0 + 2;
579    mask2 = mask0 + 4;
580    mask3 = mask0 + 6;
581    mask4 = LD_SB(ff_hevc_mask_arr + 16);
582    mask5 = mask4 + 2;
583    mask6 = mask4 + 4;
584    mask7 = mask4 + 6;
585
586    for (loop_cnt = 4; loop_cnt--;) {
587        LD_SB4(src, src_stride, src0, src1, src2, src3);
588        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
589        src += (4 * src_stride);
590        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
591
592        dst0 = const_vec;
593        dst1 = const_vec;
594        dst2 = const_vec;
595        dst3 = const_vec;
596        dst4 = const_vec;
597        dst5 = const_vec;
598        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
599        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
600        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
601        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
602                     dst1, dst2, dst3);
603        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
604        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
605        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
606        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
607        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
608                     dst1, dst2, dst3);
609        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
610        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
611        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
612        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
613        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
614                     dst1, dst2, dst3);
615        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
616        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
617        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
618        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
619        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
620                     dst1, dst2, dst3);
621        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
622
623        res0 = __msa_copy_s_d((v2i64) dst4, 0);
624        res1 = __msa_copy_s_d((v2i64) dst4, 1);
625        res2 = __msa_copy_s_d((v2i64) dst5, 0);
626        res3 = __msa_copy_s_d((v2i64) dst5, 1);
627        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
628        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
629        dst += (4 * dst_stride);
630    }
631}
632
633static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
634                               int16_t *dst, int32_t dst_stride,
635                               const int8_t *filter, int32_t height)
636{
637    uint32_t loop_cnt;
638    v16i8 src0, src1, src2, src3;
639    v8i16 filt0, filt1, filt2, filt3;
640    v16i8 mask1, mask2, mask3;
641    v16i8 vec0, vec1, vec2, vec3;
642    v8i16 dst0, dst1, dst2, dst3;
643    v8i16 filter_vec, const_vec;
644    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
645
646    src -= 3;
647    const_vec = __msa_ldi_h(128);
648    const_vec <<= 6;
649
650    filter_vec = LD_SH(filter);
651    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
652
653    mask1 = mask0 + 2;
654    mask2 = mask0 + 4;
655    mask3 = mask0 + 6;
656
657    for (loop_cnt = (height >> 1); loop_cnt--;) {
658        LD_SB2(src, src_stride, src0, src2);
659        LD_SB2(src + 8, src_stride, src1, src3);
660        src += (2 * src_stride);
661        XORI_B4_128_SB(src0, src1, src2, src3);
662
663        dst0 = const_vec;
664        dst1 = const_vec;
665        dst2 = const_vec;
666        dst3 = const_vec;
667        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
668        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
669        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
670                     dst1, dst2, dst3);
671        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
672        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
673        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
674                     dst1, dst2, dst3);
675        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
676        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
677        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
678                     dst1, dst2, dst3);
679        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
680        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
681        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
682                     dst1, dst2, dst3);
683
684        ST_SH2(dst0, dst2, dst, dst_stride);
685        ST_SH2(dst1, dst3, dst + 8, dst_stride);
686        dst += (2 * dst_stride);
687    }
688}
689
690static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
691                               int16_t *dst, int32_t dst_stride,
692                               const int8_t *filter, int32_t height)
693{
694    uint32_t loop_cnt;
695    v16i8 src0, src1, src2, src3;
696    v8i16 filt0, filt1, filt2, filt3;
697    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
698    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
699    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
700    v8i16 filter_vec, const_vec;
701    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
702
703    src -= 3;
704    filter_vec = LD_SH(filter);
705    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
706
707    mask1 = mask0 + 2;
708    mask2 = mask0 + 4;
709    mask3 = mask0 + 6;
710    mask4 = mask0 + 8;
711    mask5 = mask0 + 10;
712    mask6 = mask0 + 12;
713    mask7 = mask0 + 14;
714
715    const_vec = __msa_ldi_h(128);
716    const_vec <<= 6;
717
718    for (loop_cnt = (height >> 1); loop_cnt--;) {
719        LD_SB2(src, 16, src0, src1);
720        src += src_stride;
721        LD_SB2(src, 16, src2, src3);
722        src += src_stride;
723        XORI_B4_128_SB(src0, src1, src2, src3);
724
725        dst0 = const_vec;
726        dst1 = const_vec;
727        dst2 = const_vec;
728        dst3 = const_vec;
729        dst4 = const_vec;
730        dst5 = const_vec;
731        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
732        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
733        VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5);
734        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
735                     dst1, dst2, dst3);
736        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
737        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
738        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
739        VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5);
740        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
741                     dst1, dst2, dst3);
742        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
743        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
744        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
745        VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5);
746        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
747                     dst1, dst2, dst3);
748        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
749        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
750        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
751        VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5);
752        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
753                     dst1, dst2, dst3);
754        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
755
756        ST_SH2(dst0, dst1, dst, 8);
757        ST_SH(dst2, dst + 16);
758        dst += dst_stride;
759        ST_SH2(dst3, dst4, dst, 8);
760        ST_SH(dst5, dst + 16);
761        dst += dst_stride;
762    }
763}
764
765static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
766                               int16_t *dst, int32_t dst_stride,
767                               const int8_t *filter, int32_t height)
768{
769    uint32_t loop_cnt;
770    v16i8 src0, src1, src2;
771    v8i16 filt0, filt1, filt2, filt3;
772    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
773    v16i8 vec0, vec1, vec2, vec3;
774    v8i16 dst0, dst1, dst2, dst3;
775    v8i16 filter_vec, const_vec;
776    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
777
778    src -= 3;
779    filter_vec = LD_SH(filter);
780    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
781
782    mask1 = mask0 + 2;
783    mask2 = mask0 + 4;
784    mask3 = mask0 + 6;
785    mask4 = mask0 + 8;
786    mask5 = mask0 + 10;
787    mask6 = mask0 + 12;
788    mask7 = mask0 + 14;
789
790    const_vec = __msa_ldi_h(128);
791    const_vec <<= 6;
792
793    for (loop_cnt = height; loop_cnt--;) {
794        LD_SB2(src, 16, src0, src1);
795        src2 = LD_SB(src + 24);
796        src += src_stride;
797        XORI_B3_128_SB(src0, src1, src2);
798
799        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
800                   vec0, vec1, vec2, vec3);
801        dst0 = const_vec;
802        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
803                     dst0, dst0, dst0, dst0);
804        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
805                   vec0, vec1, vec2, vec3);
806        dst1 = const_vec;
807        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
808                     dst1, dst1, dst1, dst1);
809        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
810                   vec0, vec1, vec2, vec3);
811        dst2 = const_vec;
812        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
813                     dst2, dst2, dst2, dst2);
814        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
815                   vec0, vec1, vec2, vec3);
816        dst3 = const_vec;
817        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
818                     dst3, dst3, dst3, dst3);
819
820        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
821        dst += dst_stride;
822    }
823}
824
825static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
826                               int16_t *dst, int32_t dst_stride,
827                               const int8_t *filter, int32_t height)
828{
829    uint32_t loop_cnt;
830    v16i8 src0, src1, src2, src3;
831    v8i16 filt0, filt1, filt2, filt3;
832    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
833    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
834    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
835    v8i16 filter_vec, const_vec;
836    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
837
838    src -= 3;
839    filter_vec = LD_SH(filter);
840    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
841
842    mask1 = mask0 + 2;
843    mask2 = mask0 + 4;
844    mask3 = mask0 + 6;
845    mask4 = mask0 + 8;
846    mask5 = mask0 + 10;
847    mask6 = mask0 + 12;
848    mask7 = mask0 + 14;
849
850    const_vec = __msa_ldi_h(128);
851    const_vec <<= 6;
852
853    for (loop_cnt = height; loop_cnt--;) {
854        LD_SB3(src, 16, src0, src1, src2);
855        src3 = LD_SB(src + 40);
856        src += src_stride;
857        XORI_B4_128_SB(src0, src1, src2, src3);
858
859        dst0 = const_vec;
860        dst1 = const_vec;
861        dst2 = const_vec;
862        dst3 = const_vec;
863        dst4 = const_vec;
864        dst5 = const_vec;
865        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
866        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
867        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
868                     dst1, dst2, dst3);
869        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
870        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
871        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
872                     dst1, dst2, dst3);
873        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
874        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
875        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
876                     dst1, dst2, dst3);
877        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
878        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
879        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
880                     dst1, dst2, dst3);
881        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
882
883        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5);
884        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
885        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5);
886        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
887        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5);
888        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
889        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5);
890        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
891        ST_SH2(dst4, dst5, (dst + 32), 8);
892        dst += dst_stride;
893    }
894}
895
896static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
897                               int16_t *dst, int32_t dst_stride,
898                               const int8_t *filter, int32_t height)
899{
900    uint32_t loop_cnt;
901    v16i8 src0, src1, src2, src3, src4;
902    v8i16 filt0, filt1, filt2, filt3;
903    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
904    v16i8 vec0, vec1, vec2, vec3;
905    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
906    v8i16 filter_vec, const_vec;
907    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
908
909    src -= 3;
910
911    filter_vec = LD_SH(filter);
912    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
913
914    mask1 = mask0 + 2;
915    mask2 = mask0 + 4;
916    mask3 = mask0 + 6;
917    mask4 = mask0 + 8;
918    mask5 = mask0 + 10;
919    mask6 = mask0 + 12;
920    mask7 = mask0 + 14;
921
922    const_vec = __msa_ldi_h(128);
923    const_vec <<= 6;
924
925    for (loop_cnt = height; loop_cnt--;) {
926        LD_SB4(src, 16, src0, src1, src2, src3);
927        src4 = LD_SB(src + 56);
928        src += src_stride;
929        XORI_B5_128_SB(src0, src1, src2, src3, src4);
930
931        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
932                   vec0, vec1, vec2, vec3);
933        dst0 = const_vec;
934        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
935                     dst0, dst0, dst0, dst0);
936        ST_SH(dst0, dst);
937
938        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
939                   vec0, vec1, vec2, vec3);
940        dst1 = const_vec;
941        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
942                     dst1, dst1, dst1, dst1);
943        ST_SH(dst1, dst + 8);
944
945        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
946                   vec0, vec1, vec2, vec3);
947        dst2 = const_vec;
948        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
949                     dst2, dst2, dst2, dst2);
950        ST_SH(dst2, dst + 16);
951
952        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
953                   vec0, vec1, vec2, vec3);
954        dst3 = const_vec;
955        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
956                     dst3, dst3, dst3, dst3);
957        ST_SH(dst3, dst + 24);
958
959        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
960                   vec0, vec1, vec2, vec3);
961        dst4 = const_vec;
962        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
963                     dst4, dst4, dst4, dst4);
964        ST_SH(dst4, dst + 32);
965
966        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
967                   vec0, vec1, vec2, vec3);
968        dst5 = const_vec;
969        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
970                     dst5, dst5, dst5, dst5);
971        ST_SH(dst5, dst + 40);
972
973        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
974                   vec0, vec1, vec2, vec3);
975        dst6 = const_vec;
976        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
977                     dst6, dst6, dst6, dst6);
978        ST_SH(dst6, dst + 48);
979
980        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
981                   vec0, vec1, vec2, vec3);
982        dst7 = const_vec;
983        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
984                     dst7, dst7, dst7, dst7);
985        ST_SH(dst7, dst + 56);
986        dst += dst_stride;
987    }
988}
989
990static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
991                              int16_t *dst, int32_t dst_stride,
992                              const int8_t *filter, int32_t height)
993{
994    int32_t loop_cnt;
995    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
996    v16i8 src9, src10, src11, src12, src13, src14;
997    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
998    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
999    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1000    v16i8 src2110, src4332, src6554, src8776, src10998;
1001    v16i8 src12111110, src14131312;
1002    v8i16 dst10, dst32, dst54, dst76;
1003    v8i16 filt0, filt1, filt2, filt3;
1004    v8i16 filter_vec, const_vec;
1005
1006    src -= (3 * src_stride);
1007
1008    const_vec = __msa_ldi_h(128);
1009    const_vec <<= 6;
1010
1011    filter_vec = LD_SH(filter);
1012    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1013
1014    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1015    src += (7 * src_stride);
1016    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1017               src10_r, src32_r, src54_r, src21_r);
1018    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1019    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1020               src2110, src4332, src6554);
1021    XORI_B3_128_SB(src2110, src4332, src6554);
1022
1023    for (loop_cnt = (height >> 3); loop_cnt--;) {
1024        LD_SB8(src, src_stride,
1025               src7, src8, src9, src10, src11, src12, src13, src14);
1026        src += (8 * src_stride);
1027
1028        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1029                   src76_r, src87_r, src98_r, src109_r);
1030        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1031                   src1110_r, src1211_r, src1312_r, src1413_r);
1032        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
1033                   src1211_r, src1110_r, src1413_r, src1312_r,
1034                   src8776, src10998, src12111110, src14131312);
1035        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1036
1037        dst10 = const_vec;
1038        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1039                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1040        dst32 = const_vec;
1041        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1042                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1043        dst54 = const_vec;
1044        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1045                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1046        dst76 = const_vec;
1047        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1048                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1049
1050        ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
1051        dst += (8 * dst_stride);
1052
1053        src2110 = src10998;
1054        src4332 = src12111110;
1055        src6554 = src14131312;
1056        src6 = src14;
1057    }
1058}
1059
1060static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1061                              int16_t *dst, int32_t dst_stride,
1062                              const int8_t *filter, int32_t height)
1063{
1064    int32_t loop_cnt;
1065    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1066    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1067    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1068    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1069    v8i16 filter_vec, const_vec;
1070    v8i16 filt0, filt1, filt2, filt3;
1071
1072    src -= (3 * src_stride);
1073    const_vec = __msa_ldi_h(128);
1074    const_vec <<= 6;
1075
1076    filter_vec = LD_SH(filter);
1077    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1078
1079    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1080    src += (7 * src_stride);
1081    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1082    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1083               src10_r, src32_r, src54_r, src21_r);
1084    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1085
1086    for (loop_cnt = (height >> 2); loop_cnt--;) {
1087        LD_SB4(src, src_stride, src7, src8, src9, src10);
1088        src += (4 * src_stride);
1089        XORI_B4_128_SB(src7, src8, src9, src10);
1090        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1091                   src76_r, src87_r, src98_r, src109_r);
1092
1093        dst0_r = const_vec;
1094        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1095                     filt0, filt1, filt2, filt3,
1096                     dst0_r, dst0_r, dst0_r, dst0_r);
1097        dst1_r = const_vec;
1098        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1099                     filt0, filt1, filt2, filt3,
1100                     dst1_r, dst1_r, dst1_r, dst1_r);
1101        dst2_r = const_vec;
1102        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1103                     filt0, filt1, filt2, filt3,
1104                     dst2_r, dst2_r, dst2_r, dst2_r);
1105        dst3_r = const_vec;
1106        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1107                     filt0, filt1, filt2, filt3,
1108                     dst3_r, dst3_r, dst3_r, dst3_r);
1109
1110        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1111        dst += (4 * dst_stride);
1112
1113        src10_r = src54_r;
1114        src32_r = src76_r;
1115        src54_r = src98_r;
1116        src21_r = src65_r;
1117        src43_r = src87_r;
1118        src65_r = src109_r;
1119        src6 = src10;
1120    }
1121}
1122
1123static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1124                               int16_t *dst, int32_t dst_stride,
1125                               const int8_t *filter, int32_t height)
1126{
1127    int32_t loop_cnt;
1128    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1129    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1130    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1131    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1132    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1133    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1134    v16i8 src2110, src4332, src6554, src8776, src10998;
1135    v8i16 dst0_l, dst1_l;
1136    v8i16 filter_vec, const_vec;
1137    v8i16 filt0, filt1, filt2, filt3;
1138
1139    src -= (3 * src_stride);
1140    const_vec = __msa_ldi_h(128);
1141    const_vec <<= 6;
1142
1143    filter_vec = LD_SH(filter);
1144    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1145
1146    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1147    src += (7 * src_stride);
1148    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1149    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1150               src10_r, src32_r, src54_r, src21_r);
1151    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1152    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1153               src10_l, src32_l, src54_l, src21_l);
1154    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1155    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1156               src2110, src4332, src6554);
1157
1158    for (loop_cnt = (height >> 2); loop_cnt--;) {
1159        LD_SB4(src, src_stride, src7, src8, src9, src10);
1160        src += (4 * src_stride);
1161        XORI_B4_128_SB(src7, src8, src9, src10);
1162        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1163                   src76_r, src87_r, src98_r, src109_r);
1164        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1165                   src76_l, src87_l, src98_l, src109_l);
1166        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1167
1168        dst0_r = const_vec;
1169        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1170                     filt0, filt1, filt2, filt3,
1171                     dst0_r, dst0_r, dst0_r, dst0_r);
1172        dst1_r = const_vec;
1173        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1174                     filt0, filt1, filt2, filt3,
1175                     dst1_r, dst1_r, dst1_r, dst1_r);
1176        dst2_r = const_vec;
1177        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1178                     filt0, filt1, filt2, filt3,
1179                     dst2_r, dst2_r, dst2_r, dst2_r);
1180        dst3_r = const_vec;
1181        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1182                     filt0, filt1, filt2, filt3,
1183                     dst3_r, dst3_r, dst3_r, dst3_r);
1184        dst0_l = const_vec;
1185        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1186                     filt0, filt1, filt2, filt3,
1187                     dst0_l, dst0_l, dst0_l, dst0_l);
1188        dst1_l = const_vec;
1189        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1190                     filt0, filt1, filt2, filt3,
1191                     dst1_l, dst1_l, dst1_l, dst1_l);
1192
1193        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1194        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
1195        dst += (4 * dst_stride);
1196
1197        src10_r = src54_r;
1198        src32_r = src76_r;
1199        src54_r = src98_r;
1200        src21_r = src65_r;
1201        src43_r = src87_r;
1202        src65_r = src109_r;
1203        src2110 = src6554;
1204        src4332 = src8776;
1205        src6554 = src10998;
1206        src6 = src10;
1207    }
1208}
1209
1210static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
1211                                        int32_t src_stride,
1212                                        int16_t *dst,
1213                                        int32_t dst_stride,
1214                                        const int8_t *filter,
1215                                        int32_t height,
1216                                        int32_t width)
1217{
1218    uint8_t *src_tmp;
1219    int16_t *dst_tmp;
1220    int32_t loop_cnt, cnt;
1221    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1222    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1223    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1224    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1225    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1226    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1227    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
1228    v8i16 filter_vec, const_vec;
1229    v8i16 filt0, filt1, filt2, filt3;
1230
1231    src -= (3 * src_stride);
1232    const_vec = __msa_ldi_h(128);
1233    const_vec <<= 6;
1234
1235    filter_vec = LD_SH(filter);
1236    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1237
1238    for (cnt = width >> 4; cnt--;) {
1239        src_tmp = src;
1240        dst_tmp = dst;
1241
1242        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1243        src_tmp += (7 * src_stride);
1244        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1245        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1246                   src10_r, src32_r, src54_r, src21_r);
1247        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1248        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1249                   src10_l, src32_l, src54_l, src21_l);
1250        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1251
1252        for (loop_cnt = (height >> 2); loop_cnt--;) {
1253            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1254            src_tmp += (4 * src_stride);
1255            XORI_B4_128_SB(src7, src8, src9, src10);
1256            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1257                       src76_r, src87_r, src98_r, src109_r);
1258            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1259                       src76_l, src87_l, src98_l, src109_l);
1260
1261            dst0_r = const_vec;
1262            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1263                         filt0, filt1, filt2, filt3,
1264                         dst0_r, dst0_r, dst0_r, dst0_r);
1265            dst1_r = const_vec;
1266            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1267                         filt0, filt1, filt2, filt3,
1268                         dst1_r, dst1_r, dst1_r, dst1_r);
1269            dst2_r = const_vec;
1270            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1271                         filt0, filt1, filt2, filt3,
1272                         dst2_r, dst2_r, dst2_r, dst2_r);
1273            dst3_r = const_vec;
1274            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1275                         filt0, filt1, filt2, filt3,
1276                         dst3_r, dst3_r, dst3_r, dst3_r);
1277            dst0_l = const_vec;
1278            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1279                         filt0, filt1, filt2, filt3,
1280                         dst0_l, dst0_l, dst0_l, dst0_l);
1281            dst1_l = const_vec;
1282            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1283                         filt0, filt1, filt2, filt3,
1284                         dst1_l, dst1_l, dst1_l, dst1_l);
1285            dst2_l = const_vec;
1286            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
1287                         filt0, filt1, filt2, filt3,
1288                         dst2_l, dst2_l, dst2_l, dst2_l);
1289            dst3_l = const_vec;
1290            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
1291                         filt0, filt1, filt2, filt3,
1292                         dst3_l, dst3_l, dst3_l, dst3_l);
1293
1294            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
1295            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
1296            dst_tmp += (4 * dst_stride);
1297
1298            src10_r = src54_r;
1299            src32_r = src76_r;
1300            src54_r = src98_r;
1301            src21_r = src65_r;
1302            src43_r = src87_r;
1303            src65_r = src109_r;
1304            src10_l = src54_l;
1305            src32_l = src76_l;
1306            src54_l = src98_l;
1307            src21_l = src65_l;
1308            src43_l = src87_l;
1309            src65_l = src109_l;
1310            src6 = src10;
1311        }
1312
1313        src += 16;
1314        dst += 16;
1315    }
1316}
1317
1318static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1319                               int16_t *dst, int32_t dst_stride,
1320                               const int8_t *filter, int32_t height)
1321{
1322    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1323                                filter, height, 16);
1324}
1325
1326static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1327                               int16_t *dst, int32_t dst_stride,
1328                               const int8_t *filter, int32_t height)
1329{
1330    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1331                                filter, height, 16);
1332    hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1333                      filter, height);
1334}
1335
1336static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1337                               int16_t *dst, int32_t dst_stride,
1338                               const int8_t *filter, int32_t height)
1339{
1340    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1341                                filter, height, 32);
1342}
1343
1344static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1345                               int16_t *dst, int32_t dst_stride,
1346                               const int8_t *filter, int32_t height)
1347{
1348    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1349                                filter, height, 48);
1350}
1351
1352static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1353                               int16_t *dst, int32_t dst_stride,
1354                               const int8_t *filter, int32_t height)
1355{
1356    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1357                                filter, height, 64);
1358}
1359
1360static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
1361                              int16_t *dst, int32_t dst_stride,
1362                              const int8_t *filter_x, const int8_t *filter_y,
1363                              int32_t height)
1364{
1365    uint32_t loop_cnt;
1366    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1367    v8i16 filt0, filt1, filt2, filt3;
1368    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1369    v16i8 mask1, mask2, mask3;
1370    v8i16 filter_vec, const_vec;
1371    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1372    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1373    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1374    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1375    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1376    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1377    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1378
1379    src -= ((3 * src_stride) + 3);
1380    filter_vec = LD_SH(filter_x);
1381    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1382
1383    filter_vec = LD_SH(filter_y);
1384    UNPCK_R_SB_SH(filter_vec, filter_vec);
1385
1386    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1387
1388    mask1 = mask0 + 2;
1389    mask2 = mask0 + 4;
1390    mask3 = mask0 + 6;
1391
1392    const_vec = __msa_ldi_h(128);
1393    const_vec <<= 6;
1394
1395    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1396    src += (7 * src_stride);
1397    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1398
1399    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1400    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1401    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1402               vec8, vec9, vec10, vec11);
1403    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1404               vec12, vec13, vec14, vec15);
1405    dst30 = const_vec;
1406    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1407                 dst30, dst30, dst30, dst30);
1408    dst41 = const_vec;
1409    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1410                 dst41, dst41, dst41, dst41);
1411    dst52 = const_vec;
1412    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1413                 dst52, dst52, dst52, dst52);
1414    dst63 = const_vec;
1415    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1416                 dst63, dst63, dst63, dst63);
1417
1418    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1419    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1420    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1421    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1422
1423    for (loop_cnt = height >> 2; loop_cnt--;) {
1424        LD_SB4(src, src_stride, src7, src8, src9, src10);
1425        src += (4 * src_stride);
1426        XORI_B4_128_SB(src7, src8, src9, src10);
1427
1428        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1429                   vec0, vec1, vec2, vec3);
1430        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1431                   vec4, vec5, vec6, vec7);
1432        dst97 = const_vec;
1433        dst108 = const_vec;
1434        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1435                     dst97, dst97, dst97, dst97);
1436        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1437                     dst108, dst108, dst108, dst108);
1438
1439        dst76_r = __msa_ilvr_h(dst97, dst66);
1440        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1441        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1442        dst98_r = __msa_ilvr_h(dst66, dst108);
1443
1444        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1445                                filt_h0, filt_h1, filt_h2, filt_h3);
1446        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1447                                filt_h0, filt_h1, filt_h2, filt_h3);
1448        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r,
1449                                filt_h0, filt_h1, filt_h2, filt_h3);
1450        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r,
1451                                filt_h0, filt_h1, filt_h2, filt_h3);
1452        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1453        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1454        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
1455        dst += (4 * dst_stride);
1456
1457        dst10_r = dst54_r;
1458        dst32_r = dst76_r;
1459        dst54_r = dst98_r;
1460        dst21_r = dst65_r;
1461        dst43_r = dst87_r;
1462        dst65_r = dst109_r;
1463        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1464    }
1465}
1466
1467static void hevc_hv_8t_8multx1mult_msa(uint8_t *src,
1468                                       int32_t src_stride,
1469                                       int16_t *dst,
1470                                       int32_t dst_stride,
1471                                       const int8_t *filter_x,
1472                                       const int8_t *filter_y,
1473                                       int32_t height, int32_t width)
1474{
1475    uint32_t loop_cnt, cnt;
1476    uint8_t *src_tmp;
1477    int16_t *dst_tmp;
1478    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1479    v8i16 filt0, filt1, filt2, filt3;
1480    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1481    v16i8 mask1, mask2, mask3;
1482    v8i16 filter_vec, const_vec;
1483    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1484    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1485    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1486    v4i32 dst0_r, dst0_l;
1487    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1488    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1489    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1490
1491    src -= ((3 * src_stride) + 3);
1492    filter_vec = LD_SH(filter_x);
1493    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1494
1495    filter_vec = LD_SH(filter_y);
1496    UNPCK_R_SB_SH(filter_vec, filter_vec);
1497
1498    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1499
1500    mask1 = mask0 + 2;
1501    mask2 = mask0 + 4;
1502    mask3 = mask0 + 6;
1503
1504    const_vec = __msa_ldi_h(128);
1505    const_vec <<= 6;
1506
1507    for (cnt = width >> 3; cnt--;) {
1508        src_tmp = src;
1509        dst_tmp = dst;
1510
1511        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1512        src_tmp += (7 * src_stride);
1513        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1514
1515        /* row 0 row 1 row 2 row 3 */
1516        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1517                   vec0, vec1, vec2, vec3);
1518        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1519                   vec4, vec5, vec6, vec7);
1520        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1521                   vec8, vec9, vec10, vec11);
1522        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1523                   vec12, vec13, vec14, vec15);
1524        dst0 = const_vec;
1525        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1526                     dst0, dst0, dst0, dst0);
1527        dst1 = const_vec;
1528        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1529                     dst1, dst1, dst1, dst1);
1530        dst2 = const_vec;
1531        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1532                     dst2, dst2, dst2, dst2);
1533        dst3 = const_vec;
1534        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1535                     dst3, dst3, dst3, dst3);
1536
1537        /* row 4 row 5 row 6 */
1538        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1539                   vec0, vec1, vec2, vec3);
1540        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1541                   vec4, vec5, vec6, vec7);
1542        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1543                   vec8, vec9, vec10, vec11);
1544        dst4 = const_vec;
1545        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1546                     dst4, dst4, dst4, dst4);
1547        dst5 = const_vec;
1548        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1549                     dst5, dst5, dst5, dst5);
1550        dst6 = const_vec;
1551        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1552                     dst6, dst6, dst6, dst6);
1553
1554        for (loop_cnt = height; loop_cnt--;) {
1555            src7 = LD_SB(src_tmp);
1556            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1557            src_tmp += src_stride;
1558
1559            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1560                       vec0, vec1, vec2, vec3);
1561            dst7 = const_vec;
1562            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1563                         dst7, dst7, dst7, dst7);
1564
1565            ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1566            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1567            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1568            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1569            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1570                                    filt_h0, filt_h1, filt_h2, filt_h3);
1571            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1572                                    filt_h0, filt_h1, filt_h2, filt_h3);
1573            dst0_r >>= 6;
1574            dst0_l >>= 6;
1575
1576            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1577            ST_SW(dst0_r, dst_tmp);
1578            dst_tmp += dst_stride;
1579
1580            dst0 = dst1;
1581            dst1 = dst2;
1582            dst2 = dst3;
1583            dst3 = dst4;
1584            dst4 = dst5;
1585            dst5 = dst6;
1586            dst6 = dst7;
1587        }
1588
1589        src += 8;
1590        dst += 8;
1591    }
1592}
1593
1594static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
1595                              int16_t *dst, int32_t dst_stride,
1596                              const int8_t *filter_x, const int8_t *filter_y,
1597                              int32_t height)
1598{
1599    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1600                               filter_x, filter_y, height, 8);
1601}
1602
1603static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
1604                               int16_t *dst, int32_t dst_stride,
1605                               const int8_t *filter_x, const int8_t *filter_y,
1606                               int32_t height)
1607{
1608    uint32_t loop_cnt;
1609    uint8_t *src_tmp;
1610    int16_t *dst_tmp;
1611    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1612    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1613    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1614    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1615    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1616    v8i16 filter_vec, const_vec;
1617    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1618    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1619    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
1620    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
1621    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
1622
1623    src -= ((3 * src_stride) + 3);
1624    filter_vec = LD_SH(filter_x);
1625    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1626
1627    filter_vec = LD_SH(filter_y);
1628    UNPCK_R_SB_SH(filter_vec, filter_vec);
1629
1630    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1631
1632    mask0 = LD_SB(ff_hevc_mask_arr);
1633    mask1 = mask0 + 2;
1634    mask2 = mask0 + 4;
1635    mask3 = mask0 + 6;
1636
1637    const_vec = __msa_ldi_h(128);
1638    const_vec <<= 6;
1639
1640    src_tmp = src;
1641    dst_tmp = dst;
1642
1643    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1644    src_tmp += (7 * src_stride);
1645    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1646
1647    /* row 0 row 1 row 2 row 3 */
1648    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1649    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1650    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1651               vec11);
1652    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1653               vec15);
1654    dst0 = const_vec;
1655    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,
1656                 dst0, dst0);
1657    dst1 = const_vec;
1658    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,
1659                 dst1, dst1);
1660    dst2 = const_vec;
1661    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,
1662                 dst2, dst2, dst2);
1663    dst3 = const_vec;
1664    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,
1665                 dst3, dst3, dst3);
1666
1667    /* row 4 row 5 row 6 */
1668    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1669    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1670    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1671               vec11);
1672    dst4 = const_vec;
1673    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,
1674                 dst4, dst4);
1675    dst5 = const_vec;
1676    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,
1677                 dst5, dst5);
1678    dst6 = const_vec;
1679    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,
1680                 dst6, dst6, dst6);
1681
1682    for (loop_cnt = height; loop_cnt--;) {
1683        src7 = LD_SB(src_tmp);
1684        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1685        src_tmp += src_stride;
1686
1687        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1688                   vec3);
1689        dst7 = const_vec;
1690        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,
1691                     dst7, dst7, dst7);
1692
1693        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1694        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1695        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1696        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1697        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1698                                filt_h1, filt_h2, filt_h3);
1699        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1700                                filt_h1, filt_h2, filt_h3);
1701        dst0_r >>= 6;
1702        dst0_l >>= 6;
1703
1704        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1705        ST_SW(dst0_r, dst_tmp);
1706        dst_tmp += dst_stride;
1707
1708        dst0 = dst1;
1709        dst1 = dst2;
1710        dst2 = dst3;
1711        dst3 = dst4;
1712        dst4 = dst5;
1713        dst5 = dst6;
1714        dst6 = dst7;
1715    }
1716
1717    src += 8;
1718    dst += 8;
1719
1720    mask4 = LD_SB(ff_hevc_mask_arr + 16);
1721    mask5 = mask4 + 2;
1722    mask6 = mask4 + 4;
1723    mask7 = mask4 + 6;
1724
1725    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1726    src += (7 * src_stride);
1727    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1728
1729    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1730    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1731    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1732               vec11);
1733    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1734               vec15);
1735    dst30 = const_vec;
1736    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
1737                 dst30, dst30, dst30);
1738    dst41 = const_vec;
1739    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
1740                 dst41, dst41, dst41);
1741    dst52 = const_vec;
1742    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
1743                 dst52, dst52, dst52);
1744    dst63 = const_vec;
1745    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
1746                 dst63, dst63, dst63);
1747
1748    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1749    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1750    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1751
1752    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1753
1754    for (loop_cnt = height >> 2; loop_cnt--;) {
1755        LD_SB4(src, src_stride, src7, src8, src9, src10);
1756        src += (4 * src_stride);
1757        XORI_B4_128_SB(src7, src8, src9, src10);
1758
1759        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1760                   vec3);
1761        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1762                   vec7);
1763        dst97 = const_vec;
1764        dst108 = const_vec;
1765        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
1766                     dst97, dst97, dst97);
1767        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
1768                     dst108, dst108, dst108);
1769
1770        dst76_r = __msa_ilvr_h(dst97, dst66);
1771        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1772        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1773        dst98_r = __msa_ilvr_h(dst66, dst108);
1774
1775        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1776                                filt_h1, filt_h2, filt_h3);
1777        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1778                                filt_h1, filt_h2, filt_h3);
1779        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1780                                filt_h1, filt_h2, filt_h3);
1781        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1782                                filt_h1, filt_h2, filt_h3);
1783        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1784        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1785        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
1786        dst += (4 * dst_stride);
1787
1788        dst10_r = dst54_r;
1789        dst32_r = dst76_r;
1790        dst54_r = dst98_r;
1791        dst21_r = dst65_r;
1792        dst43_r = dst87_r;
1793        dst65_r = dst109_r;
1794        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1795    }
1796}
1797
1798static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
1799                               int16_t *dst, int32_t dst_stride,
1800                               const int8_t *filter_x, const int8_t *filter_y,
1801                               int32_t height)
1802{
1803    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1804                               filter_x, filter_y, height, 16);
1805}
1806
1807static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
1808                               int16_t *dst, int32_t dst_stride,
1809                               const int8_t *filter_x, const int8_t *filter_y,
1810                               int32_t height)
1811{
1812    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1813                               filter_x, filter_y, height, 24);
1814}
1815
1816static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
1817                               int16_t *dst, int32_t dst_stride,
1818                               const int8_t *filter_x, const int8_t *filter_y,
1819                               int32_t height)
1820{
1821    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1822                               filter_x, filter_y, height, 32);
1823}
1824
1825static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
1826                               int16_t *dst, int32_t dst_stride,
1827                               const int8_t *filter_x, const int8_t *filter_y,
1828                               int32_t height)
1829{
1830    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1831                               filter_x, filter_y, height, 48);
1832}
1833
1834static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
1835                               int16_t *dst, int32_t dst_stride,
1836                               const int8_t *filter_x, const int8_t *filter_y,
1837                               int32_t height)
1838{
1839    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1840                               filter_x, filter_y, height, 64);
1841}
1842
1843static void hevc_hz_4t_4x2_msa(uint8_t *src,
1844                               int32_t src_stride,
1845                               int16_t *dst,
1846                               int32_t dst_stride,
1847                               const int8_t *filter)
1848{
1849    v8i16 filt0, filt1;
1850    v16i8 src0, src1;
1851    v16i8 mask1, vec0, vec1;
1852    v8i16 dst0;
1853    v8i16 filter_vec, const_vec;
1854    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1855
1856    src -= 1;
1857
1858    filter_vec = LD_SH(filter);
1859    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1860
1861    mask1 = mask0 + 2;
1862
1863    const_vec = __msa_ldi_h(128);
1864    const_vec <<= 6;
1865
1866    LD_SB2(src, src_stride, src0, src1);
1867    XORI_B2_128_SB(src0, src1);
1868
1869    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1870    dst0 = const_vec;
1871    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1872
1873    ST_D2(dst0, 0, 1, dst, dst_stride);
1874}
1875
1876static void hevc_hz_4t_4x4_msa(uint8_t *src,
1877                               int32_t src_stride,
1878                               int16_t *dst,
1879                               int32_t dst_stride,
1880                               const int8_t *filter)
1881{
1882    v8i16 filt0, filt1;
1883    v16i8 src0, src1, src2, src3;
1884    v16i8 mask1, vec0, vec1;
1885    v8i16 dst0, dst1;
1886    v8i16 filter_vec, const_vec;
1887    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1888
1889    src -= 1;
1890
1891    filter_vec = LD_SH(filter);
1892    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1893
1894    mask1 = mask0 + 2;
1895
1896    const_vec = __msa_ldi_h(128);
1897    const_vec <<= 6;
1898
1899    LD_SB4(src, src_stride, src0, src1, src2, src3);
1900    XORI_B4_128_SB(src0, src1, src2, src3);
1901
1902    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1903    dst0 = const_vec;
1904    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1905
1906    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1907    dst1 = const_vec;
1908    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1909
1910    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1911}
1912
1913static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
1914                                       int32_t src_stride,
1915                                       int16_t *dst,
1916                                       int32_t dst_stride,
1917                                       const int8_t *filter,
1918                                       int32_t height)
1919{
1920    uint32_t loop_cnt;
1921    v8i16 filt0, filt1;
1922    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1923    v16i8 mask1, vec0, vec1;
1924    v8i16 dst0, dst1, dst2, dst3;
1925    v8i16 filter_vec, const_vec;
1926    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1927
1928    src -= 1;
1929
1930    filter_vec = LD_SH(filter);
1931    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1932
1933    mask1 = mask0 + 2;
1934
1935    const_vec = __msa_ldi_h(128);
1936    const_vec <<= 6;
1937
1938    for (loop_cnt = (height >> 3); loop_cnt--;) {
1939        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1940        src += (8 * src_stride);
1941
1942        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1943
1944        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1945        dst0 = const_vec;
1946        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1947        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1948        dst1 = const_vec;
1949        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1950        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
1951        dst2 = const_vec;
1952        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1953        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
1954        dst3 = const_vec;
1955        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
1956
1957        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
1958        dst += (8 * dst_stride);
1959    }
1960}
1961
1962static void hevc_hz_4t_4w_msa(uint8_t *src,
1963                              int32_t src_stride,
1964                              int16_t *dst,
1965                              int32_t dst_stride,
1966                              const int8_t *filter,
1967                              int32_t height)
1968{
1969    if (2 == height) {
1970        hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
1971    } else if (4 == height) {
1972        hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1973    } else if (0 == height % 8) {
1974        hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
1975                                   filter, height);
1976    }
1977}
1978
1979static void hevc_hz_4t_6w_msa(uint8_t *src,
1980                              int32_t src_stride,
1981                              int16_t *dst,
1982                              int32_t dst_stride,
1983                              const int8_t *filter,
1984                              int32_t height)
1985{
1986    uint32_t loop_cnt;
1987    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
1988    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
1989    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
1990    v16i8 src0, src1, src2, src3;
1991    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1992    v16i8 mask1;
1993    v16i8 vec0, vec1;
1994    v8i16 filter_vec, const_vec;
1995
1996    src -= 1;
1997
1998    filter_vec = LD_SH(filter);
1999    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2000
2001    mask1 = mask0 + 2;
2002
2003    const_vec = __msa_ldi_h(128);
2004    const_vec <<= 6;
2005
2006    for (loop_cnt = 2; loop_cnt--;) {
2007        LD_SB4(src, src_stride, src0, src1, src2, src3);
2008        src += (4 * src_stride);
2009
2010        XORI_B4_128_SB(src0, src1, src2, src3);
2011
2012        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2013        dst0 = const_vec;
2014        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2015        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2016        dst1 = const_vec;
2017        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2018        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2019        dst2 = const_vec;
2020        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2021        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2022        dst3 = const_vec;
2023        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2024
2025        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2026        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
2027        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
2028        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
2029
2030        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
2031        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
2032        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
2033        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
2034
2035        SD(dst_val0, dst);
2036        SW(dst_val_int0, dst + 4);
2037        dst += dst_stride;
2038        SD(dst_val1, dst);
2039        SW(dst_val_int1, dst + 4);
2040        dst += dst_stride;
2041        SD(dst_val2, dst);
2042        SW(dst_val_int2, dst + 4);
2043        dst += dst_stride;
2044        SD(dst_val3, dst);
2045        SW(dst_val_int3, dst + 4);
2046        dst += dst_stride;
2047    }
2048}
2049
2050static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
2051                                       int32_t src_stride,
2052                                       int16_t *dst,
2053                                       int32_t dst_stride,
2054                                       const int8_t *filter,
2055                                       int32_t height)
2056{
2057    uint32_t loop_cnt;
2058    v8i16 filt0, filt1, dst0, dst1;
2059    v16i8 src0, src1;
2060    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2061    v16i8 mask1;
2062    v16i8 vec0, vec1;
2063    v8i16 filter_vec, const_vec;
2064
2065    src -= 1;
2066
2067    filter_vec = LD_SH(filter);
2068    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2069
2070    mask1 = mask0 + 2;
2071
2072    const_vec = __msa_ldi_h(128);
2073    const_vec <<= 6;
2074
2075    for (loop_cnt = (height >> 1); loop_cnt--;) {
2076        LD_SB2(src, src_stride, src0, src1);
2077        src += (2 * src_stride);
2078
2079        XORI_B2_128_SB(src0, src1);
2080
2081        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2082        dst0 = const_vec;
2083        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2084
2085        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2086        dst1 = const_vec;
2087        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2088
2089        ST_SH2(dst0, dst1, dst, dst_stride);
2090        dst += (2 * dst_stride);
2091    }
2092}
2093
2094static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
2095                                       int32_t src_stride,
2096                                       int16_t *dst,
2097                                       int32_t dst_stride,
2098                                       const int8_t *filter,
2099                                       int32_t height)
2100{
2101    uint32_t loop_cnt;
2102    v8i16 filt0, filt1;
2103    v16i8 src0, src1, src2, src3;
2104    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2105    v16i8 mask1;
2106    v16i8 vec0, vec1;
2107    v8i16 dst0, dst1, dst2, dst3;
2108    v8i16 filter_vec, const_vec;
2109
2110    src -= 1;
2111
2112    filter_vec = LD_SH(filter);
2113    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2114
2115    mask1 = mask0 + 2;
2116
2117    const_vec = __msa_ldi_h(128);
2118    const_vec <<= 6;
2119
2120    for (loop_cnt = (height >> 2); loop_cnt--;) {
2121        LD_SB4(src, src_stride, src0, src1, src2, src3);
2122        src += (4 * src_stride);
2123
2124        XORI_B4_128_SB(src0, src1, src2, src3);
2125
2126        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2127        dst0 = const_vec;
2128        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2129
2130        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2131        dst1 = const_vec;
2132        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2133
2134        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2135        dst2 = const_vec;
2136        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2137
2138        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2139        dst3 = const_vec;
2140        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2141
2142        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2143        dst += (4 * dst_stride);
2144    }
2145}
2146
2147static void hevc_hz_4t_8w_msa(uint8_t *src,
2148                              int32_t src_stride,
2149                              int16_t *dst,
2150                              int32_t dst_stride,
2151                              const int8_t *filter,
2152                              int32_t height)
2153{
2154    if (2 == height || 6 == height) {
2155        hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
2156                                   filter, height);
2157    } else {
2158        hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
2159                                   filter, height);
2160    }
2161}
2162
2163static void hevc_hz_4t_12w_msa(uint8_t *src,
2164                               int32_t src_stride,
2165                               int16_t *dst,
2166                               int32_t dst_stride,
2167                               const int8_t *filter,
2168                               int32_t height)
2169{
2170    uint32_t loop_cnt;
2171    v8i16 filt0, filt1;
2172    v16i8 src0, src1, src2, src3;
2173    v16i8 mask1;
2174    v16i8 vec0, vec1;
2175    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2176    v8i16 filter_vec, const_vec;
2177    v16i8 mask3;
2178    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2179    v16i8 mask2 = {
2180        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2181    };
2182
2183    src -= 1;
2184
2185    filter_vec = LD_SH(filter);
2186    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2187
2188    mask1 = mask0 + 2;
2189    mask3 = mask2 + 2;
2190
2191    const_vec = __msa_ldi_h(128);
2192    const_vec <<= 6;
2193
2194    for (loop_cnt = (height >> 2); loop_cnt--;) {
2195        LD_SB4(src, src_stride, src0, src1, src2, src3);
2196        src += (4 * src_stride);
2197        XORI_B4_128_SB(src0, src1, src2, src3);
2198
2199        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2200        dst0 = const_vec;
2201        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2202        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2203        dst1 = const_vec;
2204        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2205        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2206        dst2 = const_vec;
2207        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2208        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2209        dst3 = const_vec;
2210        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2211        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2212        dst4 = const_vec;
2213        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2214        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2215        dst5 = const_vec;
2216        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2217
2218        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2219        ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride);
2220        dst += (4 * dst_stride);
2221    }
2222}
2223
2224static void hevc_hz_4t_16w_msa(uint8_t *src,
2225                               int32_t src_stride,
2226                               int16_t *dst,
2227                               int32_t dst_stride,
2228                               const int8_t *filter,
2229                               int32_t height)
2230{
2231    uint32_t loop_cnt;
2232    v16i8 src0, src1, src2, src3;
2233    v16i8 src4, src5, src6, src7;
2234    v8i16 filt0, filt1;
2235    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2236    v16i8 mask1;
2237    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2238    v16i8 vec0, vec1;
2239    v8i16 filter_vec, const_vec;
2240
2241    src -= 1;
2242
2243    filter_vec = LD_SH(filter);
2244    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2245
2246    mask1 = mask0 + 2;
2247
2248    const_vec = __msa_ldi_h(128);
2249    const_vec <<= 6;
2250
2251    for (loop_cnt = (height >> 2); loop_cnt--;) {
2252        LD_SB4(src, src_stride, src0, src2, src4, src6);
2253        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2254        src += (4 * src_stride);
2255
2256        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2257
2258        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2259        dst0 = const_vec;
2260        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2261
2262        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2263        dst1 = const_vec;
2264        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2265
2266        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2267        dst2 = const_vec;
2268        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2269
2270        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2271        dst3 = const_vec;
2272        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2273
2274        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2275        dst4 = const_vec;
2276        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2277
2278        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2279        dst5 = const_vec;
2280        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2281
2282        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2283        dst6 = const_vec;
2284        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2285
2286        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2287        dst7 = const_vec;
2288        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
2289
2290        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
2291        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
2292        dst += (4 * dst_stride);
2293    }
2294}
2295
2296static void hevc_hz_4t_24w_msa(uint8_t *src,
2297                               int32_t src_stride,
2298                               int16_t *dst,
2299                               int32_t dst_stride,
2300                               const int8_t *filter,
2301                               int32_t height)
2302{
2303    uint32_t loop_cnt;
2304    int16_t *dst_tmp = dst + 16;
2305    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2306    v8i16 filt0, filt1;
2307    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2308    v16i8 mask1, mask00, mask11;
2309    v16i8 vec0, vec1;
2310    v8i16 dst0, dst1, dst2, dst3;
2311    v8i16 filter_vec, const_vec;
2312
2313    src -= 1;
2314
2315    filter_vec = LD_SH(filter);
2316    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2317
2318    mask1 = mask0 + 2;
2319    mask00 = mask0 + 8;
2320    mask11 = mask0 + 10;
2321
2322    const_vec = __msa_ldi_h(128);
2323    const_vec <<= 6;
2324
2325    for (loop_cnt = (height >> 2); loop_cnt--;) {
2326        /* 16 width */
2327        LD_SB4(src, src_stride, src0, src2, src4, src6);
2328        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2329        src += (4 * src_stride);
2330
2331        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2332
2333        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2334        dst0 = const_vec;
2335        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2336
2337        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
2338        dst1 = const_vec;
2339        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2340
2341        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2342        dst2 = const_vec;
2343        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2344
2345        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
2346        dst3 = const_vec;
2347        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2348
2349        ST_SH2(dst0, dst1, dst, 8);
2350        dst += dst_stride;
2351        ST_SH2(dst2, dst3, dst, 8);
2352        dst += dst_stride;
2353
2354        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2355        dst0 = const_vec;
2356        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2357
2358        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
2359        dst1 = const_vec;
2360        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2361
2362        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2363        dst2 = const_vec;
2364        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2365
2366        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
2367        dst3 = const_vec;
2368        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2369
2370        ST_SH2(dst0, dst1, dst, 8);
2371        dst += dst_stride;
2372        ST_SH2(dst2, dst3, dst, 8);
2373        dst += dst_stride;
2374
2375        /* 8 width */
2376        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2377        dst0 = const_vec;
2378        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2379
2380        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2381        dst1 = const_vec;
2382        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2383
2384        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2385        dst2 = const_vec;
2386        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2387
2388        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2389        dst3 = const_vec;
2390        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2391
2392        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
2393        dst_tmp += (4 * dst_stride);
2394    }
2395}
2396
2397static void hevc_hz_4t_32w_msa(uint8_t *src,
2398                               int32_t src_stride,
2399                               int16_t *dst,
2400                               int32_t dst_stride,
2401                               const int8_t *filter,
2402                               int32_t height)
2403{
2404    uint32_t loop_cnt;
2405    v16i8 src0, src1, src2;
2406    v8i16 filt0, filt1;
2407    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2408    v16i8 mask1, mask2, mask3;
2409    v8i16 dst0, dst1, dst2, dst3;
2410    v16i8 vec0, vec1, vec2, vec3;
2411    v8i16 filter_vec, const_vec;
2412
2413    src -= 1;
2414
2415    filter_vec = LD_SH(filter);
2416    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2417
2418    const_vec = __msa_ldi_h(128);
2419    const_vec <<= 6;
2420
2421    mask1 = mask0 + 2;
2422    mask2 = mask0 + 8;
2423    mask3 = mask0 + 10;
2424
2425    for (loop_cnt = height; loop_cnt--;) {
2426        LD_SB2(src, 16, src0, src1);
2427        src2 = LD_SB(src + 24);
2428        src += src_stride;
2429
2430        XORI_B3_128_SB(src0, src1, src2);
2431
2432        dst0 = const_vec;
2433        dst1 = const_vec;
2434        dst2 = const_vec;
2435        dst3 = const_vec;
2436        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2437        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
2438        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2439                     dst1, dst2, dst3);
2440        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2441        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
2442        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2443                     dst1, dst2, dst3);
2444        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
2445        dst += dst_stride;
2446    }
2447}
2448
2449static void hevc_vt_4t_4x2_msa(uint8_t *src,
2450                               int32_t src_stride,
2451                               int16_t *dst,
2452                               int32_t dst_stride,
2453                               const int8_t *filter)
2454{
2455    v16i8 src0, src1, src2, src3, src4;
2456    v16i8 src10_r, src32_r, src21_r, src43_r;
2457    v16i8 src2110, src4332;
2458    v8i16 dst10;
2459    v8i16 filt0, filt1;
2460    v8i16 filter_vec, const_vec;
2461
2462    src -= src_stride;
2463
2464    const_vec = __msa_ldi_h(128);
2465    const_vec <<= 6;
2466
2467    filter_vec = LD_SH(filter);
2468    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2469
2470    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2471    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2472               src10_r, src21_r, src32_r, src43_r);
2473
2474    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2475    XORI_B2_128_SB(src2110, src4332);
2476    dst10 = const_vec;
2477    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2478
2479    ST_D2(dst10, 0, 1, dst, dst_stride);
2480}
2481
2482static void hevc_vt_4t_4x4_msa(uint8_t *src,
2483                               int32_t src_stride,
2484                               int16_t *dst,
2485                               int32_t dst_stride,
2486                               const int8_t *filter,
2487                               int32_t height)
2488{
2489    v16i8 src0, src1, src2, src3, src4, src5, src6;
2490    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2491    v16i8 src2110, src4332, src6554;
2492    v8i16 dst10, dst32;
2493    v8i16 filt0, filt1;
2494    v8i16 filter_vec, const_vec;
2495
2496    src -= src_stride;
2497
2498    const_vec = __msa_ldi_h(128);
2499    const_vec <<= 6;
2500
2501    filter_vec = LD_SH(filter);
2502    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2503
2504    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2505    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2506               src10_r, src21_r, src32_r, src43_r);
2507    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2508    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2509               src2110, src4332, src6554);
2510    XORI_B3_128_SB(src2110, src4332, src6554);
2511    dst10 = const_vec;
2512    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2513    dst32 = const_vec;
2514    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2515
2516    ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride);
2517}
2518
2519static void hevc_vt_4t_4x8_msa(uint8_t *src,
2520                               int32_t src_stride,
2521                               int16_t *dst,
2522                               int32_t dst_stride,
2523                               const int8_t *filter,
2524                               int32_t height)
2525{
2526    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2527    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
2528    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
2529    v16i8 src2110, src4332, src6554, src8776, src10998;
2530    v8i16 dst10, dst32, dst54, dst76;
2531    v8i16 filt0, filt1;
2532    v8i16 filter_vec, const_vec;
2533
2534    src -= src_stride;
2535    const_vec = __msa_ldi_h(128);
2536    const_vec <<= 6;
2537
2538    filter_vec = LD_SH(filter);
2539    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2540
2541    LD_SB3(src, src_stride, src0, src1, src2);
2542    src += (3 * src_stride);
2543
2544    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2545    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2546    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2547
2548    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
2549    src += (8 * src_stride);
2550    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2551               src32_r, src43_r, src54_r, src65_r);
2552    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
2553               src76_r, src87_r, src98_r, src109_r);
2554    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
2555               src98_r, src4332, src6554, src8776, src10998);
2556    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
2557    dst10 = const_vec;
2558    dst32 = const_vec;
2559    dst54 = const_vec;
2560    dst76 = const_vec;
2561    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2562    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2563    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
2564    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
2565    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2566}
2567
2568static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2569                                int16_t *dst, int32_t dst_stride,
2570                                const int8_t *filter, int32_t height)
2571{
2572    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2573    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
2574    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
2575    v16i8 src10998;
2576    v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec;
2577
2578    src -= src_stride;
2579    const_vec = __msa_ldi_h(128);
2580    const_vec <<= 6;
2581
2582    filter_vec = LD_SH(filter);
2583    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2584
2585    LD_SB3(src, src_stride, src0, src1, src2);
2586    src += (3 * src_stride);
2587
2588    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2589    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2590    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2591
2592    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
2593    src += (8 * src_stride);
2594    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
2595               src54_r, src65_r);
2596    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
2597               src87_r, src98_r, src109_r);
2598    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
2599               src98_r, src4332, src6554, src8776, src10998);
2600    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
2601
2602    dst10 = const_vec;
2603    dst32 = const_vec;
2604    dst54 = const_vec;
2605    dst76 = const_vec;
2606    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2607    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2608    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
2609    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
2610    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2611    dst += (8 * dst_stride);
2612
2613    src2 = src10;
2614    src2110 = src10998;
2615
2616    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
2617    src += (8 * src_stride);
2618
2619    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
2620               src54_r, src65_r);
2621    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
2622               src87_r, src98_r, src109_r);
2623    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
2624               src98_r, src4332, src6554, src8776, src10998);
2625    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
2626
2627    dst10 = const_vec;
2628    dst32 = const_vec;
2629    dst54 = const_vec;
2630    dst76 = const_vec;
2631    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2632    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2633    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
2634    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
2635    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2636}
2637
2638static void hevc_vt_4t_4w_msa(uint8_t *src,
2639                              int32_t src_stride,
2640                              int16_t *dst,
2641                              int32_t dst_stride,
2642                              const int8_t *filter,
2643                              int32_t height)
2644{
2645    if (2 == height) {
2646        hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2647    } else if (4 == height) {
2648        hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
2649    } else if (8 == height) {
2650        hevc_vt_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, height);
2651    } else if (16 == height) {
2652        hevc_vt_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, height);
2653    }
2654}
2655
2656static void hevc_vt_4t_6w_msa(uint8_t *src,
2657                              int32_t src_stride,
2658                              int16_t *dst,
2659                              int32_t dst_stride,
2660                              const int8_t *filter,
2661                              int32_t height)
2662{
2663    int32_t loop_cnt;
2664    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
2665    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
2666    v16i8 src0, src1, src2, src3, src4;
2667    v16i8 src10_r, src32_r, src21_r, src43_r;
2668    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2669    v8i16 filt0, filt1;
2670    v8i16 filter_vec, const_vec;
2671
2672    src -= src_stride;
2673    const_vec = __msa_ldi_h(128);
2674    const_vec <<= 6;
2675
2676    filter_vec = LD_SH(filter);
2677    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2678
2679    LD_SB3(src, src_stride, src0, src1, src2);
2680    src += (3 * src_stride);
2681    XORI_B3_128_SB(src0, src1, src2);
2682    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2683
2684    for (loop_cnt = (height >> 2); loop_cnt--;) {
2685        LD_SB2(src, src_stride, src3, src4);
2686        src += (2 * src_stride);
2687        XORI_B2_128_SB(src3, src4);
2688        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2689
2690        dst0_r = const_vec;
2691        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2692        dst1_r = const_vec;
2693        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2694
2695        LD_SB2(src, src_stride, src1, src2);
2696        src += (2 * src_stride);
2697        XORI_B2_128_SB(src1, src2);
2698        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2699
2700        dst2_r = const_vec;
2701        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2702        dst3_r = const_vec;
2703        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
2704
2705        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
2706        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
2707        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
2708        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
2709
2710        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
2711        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
2712        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
2713        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
2714
2715        SD(dst_val0, dst);
2716        SW(dst_val_int0, dst + 4);
2717        dst += dst_stride;
2718        SD(dst_val1, dst);
2719        SW(dst_val_int1, dst + 4);
2720        dst += dst_stride;
2721        SD(dst_val2, dst);
2722        SW(dst_val_int2, dst + 4);
2723        dst += dst_stride;
2724        SD(dst_val3, dst);
2725        SW(dst_val_int3, dst + 4);
2726        dst += dst_stride;
2727    }
2728}
2729
2730static void hevc_vt_4t_8x2_msa(uint8_t *src,
2731                               int32_t src_stride,
2732                               int16_t *dst,
2733                               int32_t dst_stride,
2734                               const int8_t *filter)
2735{
2736    v16i8 src0, src1, src2, src3, src4;
2737    v16i8 src10_r, src32_r, src21_r, src43_r;
2738    v8i16 dst0_r, dst1_r;
2739    v8i16 filt0, filt1;
2740    v8i16 filter_vec, const_vec;
2741
2742    src -= src_stride;
2743    const_vec = __msa_ldi_h(128);
2744    const_vec <<= 6;
2745
2746    filter_vec = LD_SH(filter);
2747    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2748
2749    LD_SB3(src, src_stride, src0, src1, src2);
2750    src += (3 * src_stride);
2751    XORI_B3_128_SB(src0, src1, src2);
2752    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2753
2754    LD_SB2(src, src_stride, src3, src4);
2755    XORI_B2_128_SB(src3, src4);
2756    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2757    dst0_r = const_vec;
2758    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2759    dst1_r = const_vec;
2760    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2761
2762    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2763}
2764
2765static void hevc_vt_4t_8x6_msa(uint8_t *src,
2766                               int32_t src_stride,
2767                               int16_t *dst,
2768                               int32_t dst_stride,
2769                               const int8_t *filter)
2770{
2771    v16i8 src0, src1, src2, src3, src4;
2772    v16i8 src10_r, src32_r, src21_r, src43_r;
2773    v8i16 dst0_r, dst1_r;
2774    v8i16 filt0, filt1;
2775    v8i16 filter_vec, const_vec;
2776
2777    src -= src_stride;
2778    const_vec = __msa_ldi_h(128);
2779    const_vec <<= 6;
2780
2781    filter_vec = LD_SH(filter);
2782    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2783
2784    LD_SB3(src, src_stride, src0, src1, src2);
2785    src += (3 * src_stride);
2786    XORI_B3_128_SB(src0, src1, src2);
2787    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2788
2789    LD_SB2(src, src_stride, src3, src4);
2790    src += (2 * src_stride);
2791    XORI_B2_128_SB(src3, src4);
2792
2793    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794    dst0_r = const_vec;
2795    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2796    dst1_r = const_vec;
2797    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2798
2799    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2800    dst += (2 * dst_stride);
2801
2802    LD_SB2(src, src_stride, src1, src2);
2803    src += (2 * src_stride);
2804    XORI_B2_128_SB(src1, src2);
2805
2806    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2807    dst0_r = const_vec;
2808    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2809    dst1_r = const_vec;
2810    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2811
2812    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2813    dst += (2 * dst_stride);
2814
2815    LD_SB2(src, src_stride, src3, src4);
2816    XORI_B2_128_SB(src3, src4);
2817
2818    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2819    dst0_r = const_vec;
2820    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2821    dst1_r = const_vec;
2822    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2823
2824    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2825}
2826
2827static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
2828                                       int32_t src_stride,
2829                                       int16_t *dst,
2830                                       int32_t dst_stride,
2831                                       const int8_t *filter,
2832                                       int32_t height)
2833{
2834    int32_t loop_cnt;
2835    v16i8 src0, src1, src2, src3, src4, src5, src6;
2836    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2837    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2838    v8i16 filt0, filt1;
2839    v8i16 filter_vec, const_vec;
2840
2841    src -= src_stride;
2842    const_vec = __msa_ldi_h(128);
2843    const_vec <<= 6;
2844
2845    filter_vec = LD_SH(filter);
2846    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2847
2848    LD_SB3(src, src_stride, src0, src1, src2);
2849    src += (3 * src_stride);
2850    XORI_B3_128_SB(src0, src1, src2);
2851    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2852
2853    for (loop_cnt = (height >> 2); loop_cnt--;) {
2854        LD_SB4(src, src_stride, src3, src4, src5, src6);
2855        src += (4 * src_stride);
2856        XORI_B4_128_SB(src3, src4, src5, src6);
2857        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2858        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2859        dst0_r = const_vec;
2860        dst1_r = const_vec;
2861        dst2_r = const_vec;
2862        dst3_r = const_vec;
2863        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2864        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2865        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
2866        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
2867        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
2868        dst += (4 * dst_stride);
2869
2870        src2 = src6;
2871        src10_r = src54_r;
2872        src21_r = src65_r;
2873    }
2874}
2875
2876static void hevc_vt_4t_8w_msa(uint8_t *src,
2877                              int32_t src_stride,
2878                              int16_t *dst,
2879                              int32_t dst_stride,
2880                              const int8_t *filter,
2881                              int32_t height)
2882{
2883    if (2 == height) {
2884        hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2885    } else if (6 == height) {
2886        hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2887    } else {
2888        hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
2889                                   filter, height);
2890    }
2891}
2892
2893static void hevc_vt_4t_12w_msa(uint8_t *src,
2894                               int32_t src_stride,
2895                               int16_t *dst,
2896                               int32_t dst_stride,
2897                               const int8_t *filter,
2898                               int32_t height)
2899{
2900    int32_t loop_cnt;
2901    v16i8 src0, src1, src2, src3, src4, src5, src6;
2902    v16i8 src10_r, src32_r, src21_r, src43_r;
2903    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2904    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2905    v16i8 src2110, src4332;
2906    v16i8 src54_r, src65_r, src6554;
2907    v8i16 dst0_l, dst1_l;
2908    v8i16 filt0, filt1;
2909    v8i16 filter_vec, const_vec;
2910
2911    src -= (1 * src_stride);
2912    const_vec = __msa_ldi_h(128);
2913    const_vec <<= 6;
2914
2915    filter_vec = LD_SH(filter);
2916    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2917
2918    LD_SB3(src, src_stride, src0, src1, src2);
2919    src += (3 * src_stride);
2920    XORI_B3_128_SB(src0, src1, src2);
2921    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2922    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2923    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2924
2925    for (loop_cnt = 4; loop_cnt--;) {
2926        LD_SB2(src, src_stride, src3, src4);
2927        src += (2 * src_stride);
2928        LD_SB2(src, src_stride, src5, src6);
2929        src += (2 * src_stride);
2930        XORI_B2_128_SB(src3, src4);
2931        XORI_B2_128_SB(src5, src6);
2932
2933        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2934        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2935        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2936        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2937        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2938        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2939
2940        dst0_r = const_vec;
2941        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2942        dst1_r = const_vec;
2943        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2944        dst2_r = const_vec;
2945        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
2946        dst3_r = const_vec;
2947        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
2948        dst0_l = const_vec;
2949        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
2950        dst1_l = const_vec;
2951        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
2952
2953        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
2954        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
2955        dst += (4 * dst_stride);
2956
2957        src2 = src6;
2958        src10_r = src54_r;
2959        src21_r = src65_r;
2960        src2110 = src6554;
2961    }
2962}
2963
2964static void hevc_vt_4t_16w_msa(uint8_t *src,
2965                               int32_t src_stride,
2966                               int16_t *dst,
2967                               int32_t dst_stride,
2968                               const int8_t *filter,
2969                               int32_t height)
2970{
2971    int32_t loop_cnt;
2972    v16i8 src0, src1, src2, src3, src4, src5;
2973    v16i8 src10_r, src32_r, src21_r, src43_r;
2974    v16i8 src10_l, src32_l, src21_l, src43_l;
2975    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
2976    v8i16 filt0, filt1;
2977    v8i16 filter_vec, const_vec;
2978
2979    src -= src_stride;
2980    const_vec = __msa_ldi_h(128);
2981    const_vec <<= 6;
2982
2983    filter_vec = LD_SH(filter);
2984    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2985
2986    LD_SB3(src, src_stride, src0, src1, src2);
2987    src += (3 * src_stride);
2988    XORI_B3_128_SB(src0, src1, src2);
2989    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2990    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2991
2992    for (loop_cnt = (height >> 2); loop_cnt--;) {
2993        LD_SB2(src, src_stride, src3, src4);
2994        src += (2 * src_stride);
2995        XORI_B2_128_SB(src3, src4);
2996        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2997        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2998        dst0_r = const_vec;
2999        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3000        dst0_l = const_vec;
3001        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3002        dst1_r = const_vec;
3003        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3004        dst1_l = const_vec;
3005        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3006        ST_SH2(dst0_r, dst0_l, dst, 8);
3007        dst += dst_stride;
3008        ST_SH2(dst1_r, dst1_l, dst, 8);
3009        dst += dst_stride;
3010
3011        LD_SB2(src, src_stride, src5, src2);
3012        src += (2 * src_stride);
3013        XORI_B2_128_SB(src5, src2);
3014        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3015        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3016        dst0_r = const_vec;
3017        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3018        dst0_l = const_vec;
3019        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3020        dst1_r = const_vec;
3021        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3022        dst1_l = const_vec;
3023        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3024        ST_SH2(dst0_r, dst0_l, dst, 8);
3025        dst += dst_stride;
3026        ST_SH2(dst1_r, dst1_l, dst, 8);
3027        dst += dst_stride;
3028    }
3029}
3030
3031static void hevc_vt_4t_24w_msa(uint8_t *src,
3032                               int32_t src_stride,
3033                               int16_t *dst,
3034                               int32_t dst_stride,
3035                               const int8_t *filter,
3036                               int32_t height)
3037{
3038    int32_t loop_cnt;
3039    v16i8 src0, src1, src2, src3, src4, src5;
3040    v16i8 src6, src7, src8, src9, src10, src11;
3041    v16i8 src10_r, src32_r, src76_r, src98_r;
3042    v16i8 src21_r, src43_r, src87_r, src109_r;
3043    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3044    v16i8 src10_l, src32_l, src21_l, src43_l;
3045    v8i16 dst0_l, dst1_l;
3046    v8i16 filt0, filt1;
3047    v8i16 filter_vec, const_vec;
3048
3049    src -= src_stride;
3050    const_vec = __msa_ldi_h(128);
3051    const_vec <<= 6;
3052
3053    filter_vec = LD_SH(filter);
3054    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3055
3056    LD_SB3(src, src_stride, src0, src1, src2);
3057    XORI_B3_128_SB(src0, src1, src2);
3058    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3059    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3060
3061    LD_SB3(src + 16, src_stride, src6, src7, src8);
3062    src += (3 * src_stride);
3063    XORI_B3_128_SB(src6, src7, src8);
3064    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3065
3066    for (loop_cnt = (height >> 2); loop_cnt--;) {
3067        LD_SB2(src, src_stride, src3, src4);
3068        XORI_B2_128_SB(src3, src4);
3069        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3070        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3071
3072        LD_SB2(src + 16, src_stride, src9, src10);
3073        src += (2 * src_stride);
3074        XORI_B2_128_SB(src9, src10);
3075        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3076
3077        dst0_r = const_vec;
3078        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3079        dst0_l = const_vec;
3080        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3081        dst1_r = const_vec;
3082        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3083        dst1_l = const_vec;
3084        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3085        dst2_r = const_vec;
3086        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3087        dst3_r = const_vec;
3088        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3089
3090        ST_SH2(dst0_r, dst0_l, dst, 8);
3091        ST_SH(dst2_r, dst + 16);
3092        dst += dst_stride;
3093        ST_SH2(dst1_r, dst1_l, dst, 8);
3094        ST_SH(dst3_r, dst + 16);
3095        dst += dst_stride;
3096
3097        LD_SB2(src, src_stride, src5, src2);
3098        XORI_B2_128_SB(src5, src2);
3099        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3100        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3101
3102        LD_SB2(src + 16, src_stride, src11, src8);
3103        src += (2 * src_stride);
3104        XORI_B2_128_SB(src11, src8);
3105        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3106
3107        dst0_r = const_vec;
3108        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3109        dst0_l = const_vec;
3110        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3111        dst1_r = const_vec;
3112        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3113        dst1_l = const_vec;
3114        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3115        dst2_r = const_vec;
3116        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3117        dst3_r = const_vec;
3118        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3119
3120        ST_SH2(dst0_r, dst0_l, dst, 8);
3121        ST_SH(dst2_r, dst + 16);
3122        dst += dst_stride;
3123        ST_SH2(dst1_r, dst1_l, dst, 8);
3124        ST_SH(dst3_r, dst + 16);
3125        dst += dst_stride;
3126    }
3127}
3128
3129static void hevc_vt_4t_32w_msa(uint8_t *src,
3130                               int32_t src_stride,
3131                               int16_t *dst,
3132                               int32_t dst_stride,
3133                               const int8_t *filter,
3134                               int32_t height)
3135{
3136    int32_t loop_cnt;
3137    v16i8 src0, src1, src2, src3, src4, src5;
3138    v16i8 src6, src7, src8, src9, src10, src11;
3139    v16i8 src10_r, src32_r, src76_r, src98_r;
3140    v16i8 src21_r, src43_r, src87_r, src109_r;
3141    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3142    v16i8 src10_l, src32_l, src76_l, src98_l;
3143    v16i8 src21_l, src43_l, src87_l, src109_l;
3144    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3145    v8i16 filt0, filt1;
3146    v8i16 filter_vec, const_vec;
3147
3148    src -= src_stride;
3149    const_vec = __msa_ldi_h(128);
3150    const_vec <<= 6;
3151
3152    filter_vec = LD_SH(filter);
3153    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3154
3155    LD_SB3(src, src_stride, src0, src1, src2);
3156    XORI_B3_128_SB(src0, src1, src2);
3157    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3158    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3159
3160    LD_SB3(src + 16, src_stride, src6, src7, src8);
3161    src += (3 * src_stride);
3162    XORI_B3_128_SB(src6, src7, src8);
3163    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3164    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3165
3166    for (loop_cnt = (height >> 2); loop_cnt--;) {
3167        LD_SB2(src, src_stride, src3, src4);
3168        XORI_B2_128_SB(src3, src4);
3169        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3170        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3171
3172        LD_SB2(src + 16, src_stride, src9, src10);
3173        src += (2 * src_stride);
3174        XORI_B2_128_SB(src9, src10);
3175        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3176        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3177
3178        dst0_r = const_vec;
3179        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3180        dst0_l = const_vec;
3181        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3182        dst1_r = const_vec;
3183        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3184        dst1_l = const_vec;
3185        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3186        dst2_r = const_vec;
3187        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3188        dst2_l = const_vec;
3189        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3190        dst3_r = const_vec;
3191        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3192        dst3_l = const_vec;
3193        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3194
3195        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
3196        dst += dst_stride;
3197        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
3198        dst += dst_stride;
3199
3200        LD_SB2(src, src_stride, src5, src2);
3201        XORI_B2_128_SB(src5, src2);
3202        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3203        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3204
3205        LD_SB2(src + 16, src_stride, src11, src8);
3206        src += (2 * src_stride);
3207        XORI_B2_128_SB(src11, src8);
3208        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3209        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);
3210
3211        dst0_r = const_vec;
3212        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3213        dst0_l = const_vec;
3214        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3215        dst1_r = const_vec;
3216        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3217        dst1_l = const_vec;
3218        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3219        dst2_r = const_vec;
3220        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3221        dst2_l = const_vec;
3222        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
3223        dst3_r = const_vec;
3224        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3225        dst3_l = const_vec;
3226        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);
3227
3228        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
3229        dst += dst_stride;
3230        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
3231        dst += dst_stride;
3232    }
3233}
3234
3235static void hevc_hv_4t_4x2_msa(uint8_t *src,
3236                               int32_t src_stride,
3237                               int16_t *dst,
3238                               int32_t dst_stride,
3239                               const int8_t *filter_x,
3240                               const int8_t *filter_y)
3241{
3242    v16i8 src0, src1, src2, src3, src4;
3243    v8i16 filt0, filt1;
3244    v8i16 filt_h0, filt_h1;
3245    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3246    v16i8 mask1;
3247    v8i16 filter_vec, const_vec;
3248    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3249    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3250    v4i32 dst0, dst1;
3251
3252    src -= (src_stride + 1);
3253    filter_vec = LD_SH(filter_x);
3254    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3255
3256    filter_vec = LD_SH(filter_y);
3257    UNPCK_R_SB_SH(filter_vec, filter_vec);
3258
3259    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3260
3261    mask1 = mask0 + 2;
3262
3263    const_vec = __msa_ldi_h(128);
3264    const_vec <<= 6;
3265
3266    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3267    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3268    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3269    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3270    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3271
3272    dst20 = const_vec;
3273    dst31 = const_vec;
3274    dst42 = const_vec;
3275    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20);
3276    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31);
3277    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42);
3278    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3279    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3280
3281    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3282    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3283    dst0 >>= 6;
3284    dst1 >>= 6;
3285    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3286    ST_D2(dst0, 0, 1, dst, dst_stride);
3287}
3288
3289static void hevc_hv_4t_4x4_msa(uint8_t *src,
3290                               int32_t src_stride,
3291                               int16_t *dst,
3292                               int32_t dst_stride,
3293                               const int8_t *filter_x,
3294                               const int8_t *filter_y)
3295{
3296    v16i8 src0, src1, src2, src3, src4, src5, src6;
3297    v8i16 filt0, filt1;
3298    v8i16 filt_h0, filt_h1;
3299    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3300    v16i8 mask1;
3301    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3302    v8i16 filter_vec, const_vec;
3303    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
3304    v4i32 dst0, dst1, dst2, dst3;
3305
3306    src -= (src_stride + 1);
3307
3308    filter_vec = LD_SH(filter_x);
3309    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3310
3311    filter_vec = LD_SH(filter_y);
3312    UNPCK_R_SB_SH(filter_vec, filter_vec);
3313
3314    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3315
3316    mask1 = mask0 + 2;
3317
3318    const_vec = __msa_ldi_h(128);
3319    const_vec <<= 6;
3320
3321    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3322    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3323
3324    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3325    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3326    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3327    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3328
3329    dst30 = const_vec;
3330    dst41 = const_vec;
3331    dst52 = const_vec;
3332    dst63 = const_vec;
3333    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30);
3334    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41);
3335    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52);
3336    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63);
3337
3338    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3339    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3340    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3341
3342    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3343    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3344    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3345    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3346    SRA_4V(dst0, dst1, dst2, dst3, 6);
3347    PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
3348    ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride);
3349}
3350
3351
3352static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
3353                                       int32_t src_stride,
3354                                       int16_t *dst,
3355                                       int32_t dst_stride,
3356                                       const int8_t *filter_x,
3357                                       const int8_t *filter_y,
3358                                       int32_t height)
3359{
3360    uint32_t loop_cnt;
3361    v16i8 src0, src1, src2, src3, src4, src5, src6;
3362    v16i8 src7, src8, src9, src10;
3363    v8i16 filt0, filt1;
3364    v8i16 filt_h0, filt_h1;
3365    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3366    v16i8 mask1;
3367    v8i16 filter_vec, const_vec;
3368    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3369    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3370    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
3371    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
3372    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3373
3374    src -= (src_stride + 1);
3375    filter_vec = LD_SH(filter_x);
3376    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3377
3378    filter_vec = LD_SH(filter_y);
3379    UNPCK_R_SB_SH(filter_vec, filter_vec);
3380
3381    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3382
3383    mask1 = mask0 + 2;
3384
3385    const_vec = __msa_ldi_h(128);
3386    const_vec <<= 6;
3387
3388    LD_SB3(src, src_stride, src0, src1, src2);
3389    src += (3 * src_stride);
3390    XORI_B3_128_SB(src0, src1, src2);
3391    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3392    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3393    dst10 = const_vec;
3394    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
3395    dst21 = const_vec;
3396    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
3397    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3398    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3399
3400    for (loop_cnt = height >> 3; loop_cnt--;) {
3401        LD_SB8(src, src_stride,
3402               src3, src4, src5, src6, src7, src8, src9, src10);
3403        src += (8 * src_stride);
3404        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3405
3406        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3407        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3408        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3409        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3410
3411        dst73 = const_vec;
3412        dst84 = const_vec;
3413        dst95 = const_vec;
3414        dst106 = const_vec;
3415        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
3416        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
3417        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
3418        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
3419
3420        dst32_r = __msa_ilvr_h(dst73, dst22);
3421        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3422        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3423        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3424        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3425        dst76_r = __msa_ilvr_h(dst22, dst106);
3426
3427        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3428        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3429        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3430        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3431        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3432        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3433        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3434        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3435        SRA_4V(dst0, dst1, dst2, dst3, 6);
3436        SRA_4V(dst4, dst5, dst6, dst7, 6);
3437        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3438                    dst0, dst1, dst2, dst3);
3439        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
3440        dst += (8 * dst_stride);
3441
3442        dst10_r = dst98_r;
3443        dst21_r = dst109_r;
3444        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3445    }
3446}
3447
3448static void hevc_hv_4t_4w_msa(uint8_t *src,
3449                              int32_t src_stride,
3450                              int16_t *dst,
3451                              int32_t dst_stride,
3452                              const int8_t *filter_x,
3453                              const int8_t *filter_y,
3454                              int32_t height)
3455{
3456    if (2 == height) {
3457        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
3458                           filter_x, filter_y);
3459    } else if (4 == height) {
3460        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
3461                           filter_x, filter_y);
3462    } else if (0 == (height % 8)) {
3463        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3464                                   filter_x, filter_y, height);
3465    }
3466}
3467
3468static void hevc_hv_4t_6w_msa(uint8_t *src,
3469                              int32_t src_stride,
3470                              int16_t *dst,
3471                              int32_t dst_stride,
3472                              const int8_t *filter_x,
3473                              const int8_t *filter_y,
3474                              int32_t height)
3475{
3476    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3477    v8i16 filt0, filt1;
3478    v8i16 filt_h0, filt_h1;
3479    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3480    v16i8 mask1;
3481    v8i16 filter_vec, const_vec;
3482    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3483    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3484    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3485    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
3486    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
3487    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
3488    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3489    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3490    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
3491
3492    src -= (src_stride + 1);
3493    filter_vec = LD_SH(filter_x);
3494    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3495
3496    filter_vec = LD_SH(filter_y);
3497    UNPCK_R_SB_SH(filter_vec, filter_vec);
3498
3499    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3500
3501    mask1 = mask0 + 2;
3502
3503    const_vec = __msa_ldi_h(128);
3504    const_vec <<= 6;
3505
3506    LD_SB3(src, src_stride, src0, src1, src2);
3507    src += (3 * src_stride);
3508    XORI_B3_128_SB(src0, src1, src2);
3509
3510    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3511    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3512    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3513
3514    dsth0 = const_vec;
3515    dsth1 = const_vec;
3516    dsth2 = const_vec;
3517    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0);
3518    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1);
3519    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2);
3520
3521    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3522    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3523
3524    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3525    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3526
3527    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3528    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3529    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3530    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3531
3532    dsth3 = const_vec;
3533    dsth4 = const_vec;
3534    dsth5 = const_vec;
3535    dsth6 = const_vec;
3536    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3);
3537    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4);
3538    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5);
3539    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6);
3540
3541    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3542    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3543    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3544    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3545
3546    dsth7 = const_vec;
3547    dsth8 = const_vec;
3548    dsth9 = const_vec;
3549    dsth10 = const_vec;
3550    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7);
3551    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8);
3552    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9);
3553    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);
3554
3555    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3556    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3557    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3558    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3559    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3560    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3561    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3562    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3563
3564    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3565    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3566    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3567
3568    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3569    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3570    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3571    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3572    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3573    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3574    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3575    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3576    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3577    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3578    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3579    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3580    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3581    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3582    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3583    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3584    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3585    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3586    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3587    ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
3588    dst += 4 * dst_stride;
3589    ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
3590    ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
3591}
3592
3593static void hevc_hv_4t_8x2_msa(uint8_t *src,
3594                               int32_t src_stride,
3595                               int16_t *dst,
3596                               int32_t dst_stride,
3597                               const int8_t *filter_x,
3598                               const int8_t *filter_y)
3599{
3600    v16i8 src0, src1, src2, src3, src4;
3601    v8i16 filt0, filt1;
3602    v8i16 filt_h0, filt_h1;
3603    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3604    v16i8 mask1;
3605    v8i16 filter_vec, const_vec;
3606    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3607    v8i16 dst0, dst1, dst2, dst3, dst4;
3608    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3609    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3610    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3611
3612    src -= (src_stride + 1);
3613
3614    filter_vec = LD_SH(filter_x);
3615    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3616
3617    filter_vec = LD_SH(filter_y);
3618    UNPCK_R_SB_SH(filter_vec, filter_vec);
3619
3620    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3621
3622    mask1 = mask0 + 2;
3623
3624    const_vec = __msa_ldi_h(128);
3625    const_vec <<= 6;
3626
3627    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3628    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3629
3630    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3631    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3632    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3633    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3634    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3635
3636    dst0 = const_vec;
3637    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3638    dst1 = const_vec;
3639    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3640    dst2 = const_vec;
3641    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3642    dst3 = const_vec;
3643    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
3644    dst4 = const_vec;
3645    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
3646
3647    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3648    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3649    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3650    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3651    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3652    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3653    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3654    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3655    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3656    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3657    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3658}
3659
3660static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride,
3661                                   int16_t *dst, int32_t dst_stride,
3662                                   const int8_t *filter_x,
3663                                   const int8_t *filter_y, int32_t width8mult)
3664{
3665    int32_t cnt;
3666    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3667    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3668    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
3669    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
3670    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3671    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3672    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3673
3674    src -= (src_stride + 1);
3675
3676    filter_vec = LD_SH(filter_x);
3677    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3678
3679    filter_vec = LD_SH(filter_y);
3680    UNPCK_R_SB_SH(filter_vec, filter_vec);
3681
3682    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3683
3684    mask0 = LD_SB(ff_hevc_mask_arr);
3685    mask1 = mask0 + 2;
3686
3687    const_vec = __msa_ldi_h(128);
3688    const_vec <<= 6;
3689
3690    for (cnt = width8mult; cnt--;) {
3691        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3692        src += 8;
3693        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3694
3695        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3696        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3697        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3698
3699        dst0 = const_vec;
3700        dst1 = const_vec;
3701        dst2 = const_vec;
3702        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3703        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3704        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3705
3706        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3707        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3708
3709        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3710        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3711        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3712        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3713        dst3 = const_vec;
3714        dst4 = const_vec;
3715        dst5 = const_vec;
3716        dst6 = const_vec;
3717        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3718        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3719        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3720        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3721        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3722        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3723        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3724        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3725        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3726        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3727        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3728        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3729
3730        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3731        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3732        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3733        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3734        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3735        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3736        PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3737        PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3738
3739        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
3740        dst += 8;
3741    }
3742}
3743
3744static void hevc_hv_4t_8x6_msa(uint8_t *src,
3745                               int32_t src_stride,
3746                               int16_t *dst,
3747                               int32_t dst_stride,
3748                               const int8_t *filter_x,
3749                               const int8_t *filter_y)
3750{
3751    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3752    v8i16 filt0, filt1;
3753    v8i16 filt_h0, filt_h1;
3754    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3755    v16i8 mask1;
3756    v8i16 filter_vec, const_vec;
3757    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3758    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3759    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3760    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3761    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3762    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3763    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3764    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3765    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3766
3767    src -= (src_stride + 1);
3768
3769    filter_vec = LD_SH(filter_x);
3770    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3771
3772    filter_vec = LD_SH(filter_y);
3773    UNPCK_R_SB_SH(filter_vec, filter_vec);
3774
3775    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3776
3777    mask1 = mask0 + 2;
3778
3779    const_vec = __msa_ldi_h(128);
3780    const_vec <<= 6;
3781
3782    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3783    src += (5 * src_stride);
3784    LD_SB4(src, src_stride, src5, src6, src7, src8);
3785
3786    XORI_B5_128_SB(src0, src1, src2, src3, src4);
3787    XORI_B4_128_SB(src5, src6, src7, src8);
3788
3789    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3790    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3791    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3792    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3793    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3794    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3795    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3796    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3797    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3798
3799    dst0 = const_vec;
3800    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3801    dst1 = const_vec;
3802    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3803    dst2 = const_vec;
3804    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3805    dst3 = const_vec;
3806    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
3807    dst4 = const_vec;
3808    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
3809    dst5 = const_vec;
3810    DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5);
3811    dst6 = const_vec;
3812    DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6);
3813    dst7 = const_vec;
3814    DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7);
3815    dst8 = const_vec;
3816    DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8);
3817
3818    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3819    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3820    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3821    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3822    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3823    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3824    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3825    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3826
3827    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3828    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3829    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3830    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3831    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3832    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3833    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3834    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3835    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3836    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3837    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3838    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3839
3840    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3841    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3842    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3843
3844    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3845                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
3846    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
3847
3848    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3849    dst += (2 * dst_stride);
3850    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
3851    dst += (2 * dst_stride);
3852    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
3853}
3854
3855static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
3856                                       int32_t src_stride,
3857                                       int16_t *dst,
3858                                       int32_t dst_stride,
3859                                       const int8_t *filter_x,
3860                                       const int8_t *filter_y,
3861                                       int32_t height,
3862                                       int32_t width8mult)
3863{
3864    uint32_t loop_cnt, cnt;
3865    uint8_t *src_tmp;
3866    int16_t *dst_tmp;
3867    v16i8 src0, src1, src2, src3, src4, src5, src6;
3868    v8i16 filt0, filt1;
3869    v8i16 filt_h0, filt_h1;
3870    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3871    v16i8 mask1;
3872    v8i16 filter_vec, const_vec;
3873    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3874    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
3875    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3876    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3877    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3878
3879    src -= (src_stride + 1);
3880
3881    filter_vec = LD_SH(filter_x);
3882    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3883
3884    filter_vec = LD_SH(filter_y);
3885    UNPCK_R_SB_SH(filter_vec, filter_vec);
3886
3887    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3888
3889    mask1 = mask0 + 2;
3890
3891    const_vec = __msa_ldi_h(128);
3892    const_vec <<= 6;
3893
3894    for (cnt = width8mult; cnt--;) {
3895        src_tmp = src;
3896        dst_tmp = dst;
3897
3898        LD_SB3(src_tmp, src_stride, src0, src1, src2);
3899        src_tmp += (3 * src_stride);
3900
3901        XORI_B3_128_SB(src0, src1, src2);
3902
3903        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3904        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3905        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3906
3907        dst0 = const_vec;
3908        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3909        dst1 = const_vec;
3910        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3911        dst2 = const_vec;
3912        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3913
3914        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3915        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3916
3917        for (loop_cnt = height >> 2; loop_cnt--;) {
3918            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3919            src_tmp += (4 * src_stride);
3920            XORI_B4_128_SB(src3, src4, src5, src6);
3921
3922            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3923            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3924            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3925            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3926
3927            dst3 = const_vec;
3928            dst4 = const_vec;
3929            dst5 = const_vec;
3930            dst6 = const_vec;
3931            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3932            DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3933            DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3934            DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3935
3936            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3937            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3938            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3939            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3940
3941            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3942            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3943            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3944            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3945            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3946            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3947            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3948            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3949
3950            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3951            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3952
3953            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3954                        dst2_l, dst2_r, dst3_l, dst3_r,
3955                        dst0_r, dst1_r, dst2_r, dst3_r);
3956
3957            ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
3958            dst_tmp += (4 * dst_stride);
3959
3960            dst10_r = dst54_r;
3961            dst10_l = dst54_l;
3962            dst21_r = dst65_r;
3963            dst21_l = dst65_l;
3964            dst2 = dst6;
3965        }
3966
3967        src += 8;
3968        dst += 8;
3969    }
3970}
3971
3972static void hevc_hv_4t_8w_msa(uint8_t *src,
3973                              int32_t src_stride,
3974                              int16_t *dst,
3975                              int32_t dst_stride,
3976                              const int8_t *filter_x,
3977                              const int8_t *filter_y,
3978                              int32_t height)
3979{
3980
3981    if (2 == height) {
3982        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
3983                           filter_x, filter_y);
3984    } else if (4 == height) {
3985        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3986                               filter_x, filter_y, 1);
3987    } else if (6 == height) {
3988        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
3989                           filter_x, filter_y);
3990    } else if (0 == (height % 4)) {
3991        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3992                                   filter_x, filter_y, height, 1);
3993    }
3994}
3995
3996static void hevc_hv_4t_12w_msa(uint8_t *src,
3997                               int32_t src_stride,
3998                               int16_t *dst,
3999                               int32_t dst_stride,
4000                               const int8_t *filter_x,
4001                               const int8_t *filter_y,
4002                               int32_t height)
4003{
4004    uint32_t loop_cnt;
4005    uint8_t *src_tmp;
4006    int16_t *dst_tmp;
4007    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4008    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4009    v16i8 mask0, mask1, mask2, mask3;
4010    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4011    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
4012    v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
4013    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4014    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4015    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4016    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4017
4018    src -= (src_stride + 1);
4019
4020    filter_vec = LD_SH(filter_x);
4021    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4022
4023    filter_vec = LD_SH(filter_y);
4024    UNPCK_R_SB_SH(filter_vec, filter_vec);
4025
4026    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4027
4028    mask0 = LD_SB(ff_hevc_mask_arr);
4029    mask1 = mask0 + 2;
4030
4031    const_vec = __msa_ldi_h(128);
4032    const_vec <<= 6;
4033
4034    src_tmp = src;
4035    dst_tmp = dst;
4036
4037    LD_SB3(src_tmp, src_stride, src0, src1, src2);
4038    src_tmp += (3 * src_stride);
4039
4040    XORI_B3_128_SB(src0, src1, src2);
4041
4042    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4043    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4044    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4045
4046    dst0 = const_vec;
4047    dst1 = const_vec;
4048    dst2 = const_vec;
4049    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4050    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4051    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4052
4053    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4054    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4055
4056    for (loop_cnt = 4; loop_cnt--;) {
4057        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4058        src_tmp += (4 * src_stride);
4059        XORI_B4_128_SB(src3, src4, src5, src6);
4060
4061        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4062        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4063        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4064        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4065
4066        dst3 = const_vec;
4067        dst4 = const_vec;
4068        dst5 = const_vec;
4069        dst6 = const_vec;
4070        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4071        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
4072        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
4073        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
4074
4075        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4076        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4077        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4078        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4079
4080        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4081        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4082        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4083        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4084        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4085        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4086        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4087        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4088
4089        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4090        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4091        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4092                    dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
4093        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
4094        dst_tmp += (4 * dst_stride);
4095
4096        dst10_r = dst54_r;
4097        dst10_l = dst54_l;
4098        dst21_r = dst65_r;
4099        dst21_l = dst65_l;
4100        dst2 = dst6;
4101    }
4102
4103    src += 8;
4104    dst += 8;
4105
4106    mask2 = LD_SB(ff_hevc_mask_arr + 16);
4107    mask3 = mask2 + 2;
4108
4109    LD_SB3(src, src_stride, src0, src1, src2);
4110    src += (3 * src_stride);
4111    XORI_B3_128_SB(src0, src1, src2);
4112    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4113    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4114    dst10 = const_vec;
4115    dst21 = const_vec;
4116    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
4117    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
4118    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4119    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4120
4121    for (loop_cnt = 2; loop_cnt--;) {
4122        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
4123               src10);
4124        src += (8 * src_stride);
4125        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4126        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4127        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4128        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4129        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4130
4131        dst73 = const_vec;
4132        dst84 = const_vec;
4133        dst95 = const_vec;
4134        dst106 = const_vec;
4135        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
4136        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
4137        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
4138        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
4139
4140        dst32_r = __msa_ilvr_h(dst73, dst22);
4141        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4142        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4143        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4144        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4145        dst76_r = __msa_ilvr_h(dst22, dst106);
4146
4147        tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4148        tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4149        tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4150        tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4151        tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4152        tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4153        tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4154        tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4155
4156        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
4157        SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
4158        PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
4159                    tmp2, tmp3);
4160        ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
4161        dst += (8 * dst_stride);
4162
4163        dst10_r = dst98_r;
4164        dst21_r = dst109_r;
4165        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4166    }
4167}
4168
4169static void hevc_hv_4t_16w_msa(uint8_t *src,
4170                               int32_t src_stride,
4171                               int16_t *dst,
4172                               int32_t dst_stride,
4173                               const int8_t *filter_x,
4174                               const int8_t *filter_y,
4175                               int32_t height)
4176{
4177    if (4 == height) {
4178        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
4179                               filter_x, filter_y, 2);
4180    } else {
4181        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4182                                   filter_x, filter_y, height, 2);
4183    }
4184}
4185
4186static void hevc_hv_4t_24w_msa(uint8_t *src,
4187                               int32_t src_stride,
4188                               int16_t *dst,
4189                               int32_t dst_stride,
4190                               const int8_t *filter_x,
4191                               const int8_t *filter_y,
4192                               int32_t height)
4193{
4194    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4195                               filter_x, filter_y, height, 3);
4196}
4197
4198static void hevc_hv_4t_32w_msa(uint8_t *src,
4199                               int32_t src_stride,
4200                               int16_t *dst,
4201                               int32_t dst_stride,
4202                               const int8_t *filter_x,
4203                               const int8_t *filter_y,
4204                               int32_t height)
4205{
4206    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4207                               filter_x, filter_y, height, 4);
4208}
4209
4210#define MC_COPY(WIDTH)                                                    \
4211void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
4212                                                uint8_t *src,             \
4213                                                ptrdiff_t src_stride,     \
4214                                                int height,               \
4215                                                intptr_t mx,              \
4216                                                intptr_t my,              \
4217                                                int width)                \
4218{                                                                         \
4219    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
4220}
4221
4222MC_COPY(4);
4223MC_COPY(6);
4224MC_COPY(8);
4225MC_COPY(12);
4226MC_COPY(16);
4227MC_COPY(24);
4228MC_COPY(32);
4229MC_COPY(48);
4230MC_COPY(64);
4231
4232#undef MC_COPY
4233
4234#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
4235void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
4236                                                   uint8_t *src,          \
4237                                                   ptrdiff_t src_stride,  \
4238                                                   int height,            \
4239                                                   intptr_t mx,           \
4240                                                   intptr_t my,           \
4241                                                   int width)             \
4242{                                                                         \
4243    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
4244                                                                          \
4245    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
4246                                          MAX_PB_SIZE, filter, height);   \
4247}
4248
4249MC(qpel, h, 4, 8, hz, mx);
4250MC(qpel, h, 8, 8, hz, mx);
4251MC(qpel, h, 12, 8, hz, mx);
4252MC(qpel, h, 16, 8, hz, mx);
4253MC(qpel, h, 24, 8, hz, mx);
4254MC(qpel, h, 32, 8, hz, mx);
4255MC(qpel, h, 48, 8, hz, mx);
4256MC(qpel, h, 64, 8, hz, mx);
4257
4258MC(qpel, v, 4, 8, vt, my);
4259MC(qpel, v, 8, 8, vt, my);
4260MC(qpel, v, 12, 8, vt, my);
4261MC(qpel, v, 16, 8, vt, my);
4262MC(qpel, v, 24, 8, vt, my);
4263MC(qpel, v, 32, 8, vt, my);
4264MC(qpel, v, 48, 8, vt, my);
4265MC(qpel, v, 64, 8, vt, my);
4266
4267MC(epel, h, 4, 4, hz, mx);
4268MC(epel, h, 6, 4, hz, mx);
4269MC(epel, h, 8, 4, hz, mx);
4270MC(epel, h, 12, 4, hz, mx);
4271MC(epel, h, 16, 4, hz, mx);
4272MC(epel, h, 24, 4, hz, mx);
4273MC(epel, h, 32, 4, hz, mx);
4274
4275MC(epel, v, 4, 4, vt, my);
4276MC(epel, v, 6, 4, vt, my);
4277MC(epel, v, 8, 4, vt, my);
4278MC(epel, v, 12, 4, vt, my);
4279MC(epel, v, 16, 4, vt, my);
4280MC(epel, v, 24, 4, vt, my);
4281MC(epel, v, 32, 4, vt, my);
4282
4283#undef MC
4284
4285#define MC_HV(PEL, WIDTH, TAP)                                          \
4286void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst,           \
4287                                                uint8_t *src,           \
4288                                                ptrdiff_t src_stride,   \
4289                                                int height,             \
4290                                                intptr_t mx,            \
4291                                                intptr_t my,            \
4292                                                int width)              \
4293{                                                                       \
4294    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
4295    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
4296                                                                        \
4297    hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,  \
4298                                          filter_x, filter_y, height);  \
4299}
4300
4301MC_HV(qpel, 4, 8);
4302MC_HV(qpel, 8, 8);
4303MC_HV(qpel, 12, 8);
4304MC_HV(qpel, 16, 8);
4305MC_HV(qpel, 24, 8);
4306MC_HV(qpel, 32, 8);
4307MC_HV(qpel, 48, 8);
4308MC_HV(qpel, 64, 8);
4309
4310MC_HV(epel, 4, 4);
4311MC_HV(epel, 6, 4);
4312MC_HV(epel, 8, 4);
4313MC_HV(epel, 12, 4);
4314MC_HV(epel, 16, 4);
4315MC_HV(epel, 24, 4);
4316MC_HV(epel, 32, 4);
4317
4318#undef MC_HV
4319