1/*
2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "libavcodec/mips/hevcdsp_mips.h"
23#include "libavcodec/mips/hevc_macros_msa.h"
24
/* Byte-shuffle control masks for the VSHF.B based horizontal filters.
 * Each 16-byte row pairs adjacent source pixels (n, n+1) so that a
 * signed-byte dot-product consumes two taps at a time.  Indices >= 16
 * select bytes from the second source operand of the shuffle. */
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
33
/* 8-tap horizontal filter for 4-pixel-wide blocks: four rows (two rows per
 * shuffle, via the 4-width masks) are filtered into two v8i16 accumulators.
 * Each filtN is expected to hold one pair of adjacent taps splatted across
 * the vector; DOTP starts the accumulation, DPADD adds the remaining pairs. */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                  \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                 \
}
50
/* 8-tap horizontal filter for 8-pixel-wide blocks: each of the four rows is
 * shuffled against itself (8-width masks) and filtered into its own v8i16
 * accumulator.  Tap pairs are applied in the order 0, 2, 1, 3; the DOTP /
 * DPADD sequence makes the result independent of that ordering. */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2, mask3,                \
                                   filt0, filt1, filt2, filt3,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,  \
                 out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
                 out0, out1, out2, out3);                                     \
}
75
/* 4-tap variant of the 4-wide horizontal filter: two tap pairs only
 * (filt0, filt1), four rows filtered into two v8i16 accumulators. */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}
87
/* 4-tap variant of the 8-wide horizontal filter: two tap pairs only,
 * one v8i16 accumulator per input row. */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
}
103
/* Copy an 8-byte-wide block of 'height' rows using 64-bit GPR loads/stores.
 * Handles heights of 2 and 6 explicitly, otherwise multiples of 8 or 4. */
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t row;
    uint64_t dw0, dw1, dw2, dw3, dw4, dw5, dw6, dw7;

    if (height == 2) {
        LD2(src, src_stride, dw0, dw1);
        SD(dw0, dst);
        SD(dw1, dst + dst_stride);
    } else if (height == 6) {
        /* 4 rows, then the trailing 2 */
        LD4(src, src_stride, dw0, dw1, dw2, dw3);
        src += 4 * src_stride;
        SD4(dw0, dw1, dw2, dw3, dst, dst_stride);
        dst += 4 * dst_stride;
        LD2(src, src_stride, dw0, dw1);
        SD(dw0, dst);
        SD(dw1, dst + dst_stride);
    } else if ((height % 8) == 0) {
        /* unrolled: 8 rows per iteration */
        for (row = height >> 3; row > 0; row--) {
            LD4(src, src_stride, dw0, dw1, dw2, dw3);
            src += 4 * src_stride;
            LD4(src, src_stride, dw4, dw5, dw6, dw7);
            src += 4 * src_stride;
            SD4(dw0, dw1, dw2, dw3, dst, dst_stride);
            dst += 4 * dst_stride;
            SD4(dw4, dw5, dw6, dw7, dst, dst_stride);
            dst += 4 * dst_stride;
        }
    } else if ((height % 4) == 0) {
        /* 4 rows per iteration */
        for (row = height >> 2; row > 0; row--) {
            LD4(src, src_stride, dw0, dw1, dw2, dw3);
            src += 4 * src_stride;
            SD4(dw0, dw1, dw2, dw3, dst, dst_stride);
            dst += 4 * dst_stride;
        }
    }
}
145
146static void copy_width12_msa(uint8_t *src, int32_t src_stride,
147                             uint8_t *dst, int32_t dst_stride,
148                             int32_t height)
149{
150    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
151
152    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
153    src += (8 * src_stride);
154    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
155    dst += (8 * dst_stride);
156    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
157    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
158}
159
160static void copy_width16_msa(uint8_t *src, int32_t src_stride,
161                             uint8_t *dst, int32_t dst_stride,
162                             int32_t height)
163{
164    int32_t cnt;
165    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
166
167    if (12 == height) {
168        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169        src += (8 * src_stride);
170        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171        dst += (8 * dst_stride);
172        LD_UB4(src, src_stride, src0, src1, src2, src3);
173        src += (4 * src_stride);
174        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
175        dst += (4 * dst_stride);
176    } else if (0 == (height % 8)) {
177        for (cnt = (height >> 3); cnt--;) {
178            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
179                   src7);
180            src += (8 * src_stride);
181            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
182                   dst_stride);
183            dst += (8 * dst_stride);
184        }
185    } else if (0 == (height % 4)) {
186        for (cnt = (height >> 2); cnt--;) {
187            LD_UB4(src, src_stride, src0, src1, src2, src3);
188            src += (4 * src_stride);
189
190            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
191            dst += (4 * dst_stride);
192        }
193    }
194}
195
196static void copy_width24_msa(uint8_t *src, int32_t src_stride,
197                             uint8_t *dst, int32_t dst_stride,
198                             int32_t height)
199{
200    int32_t cnt;
201    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
202    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
203
204    for (cnt = 4; cnt--;) {
205        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
206        LD4(src + 16, src_stride, out0, out1, out2, out3);
207        src += (4 * src_stride);
208        LD4(src + 16, src_stride, out4, out5, out6, out7);
209        src += (4 * src_stride);
210
211        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213        dst += (4 * dst_stride);
214        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215        dst += (4 * dst_stride);
216    }
217}
218
219static void copy_width32_msa(uint8_t *src, int32_t src_stride,
220                             uint8_t *dst, int32_t dst_stride,
221                             int32_t height)
222{
223    int32_t cnt;
224    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
225
226    for (cnt = (height >> 2); cnt--;) {
227        LD_UB4(src, src_stride, src0, src1, src2, src3);
228        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
229        src += (4 * src_stride);
230        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
231        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232        dst += (4 * dst_stride);
233    }
234}
235
236static void copy_width48_msa(uint8_t *src, int32_t src_stride,
237                             uint8_t *dst, int32_t dst_stride,
238                             int32_t height)
239{
240    int32_t cnt;
241    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
242    v16u8 src11;
243
244    for (cnt = (height >> 2); cnt--;) {
245        LD_UB4(src, src_stride, src0, src1, src2, src3);
246        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
247        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
248        src += (4 * src_stride);
249
250        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
251        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253        dst += (4 * dst_stride);
254    }
255}
256
257static void copy_width64_msa(uint8_t *src, int32_t src_stride,
258                             uint8_t *dst, int32_t dst_stride,
259                             int32_t height)
260{
261    int32_t cnt;
262    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
263    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
264
265    for (cnt = (height >> 2); cnt--;) {
266        LD_UB4(src, 16, src0, src1, src2, src3);
267        src += src_stride;
268        LD_UB4(src, 16, src4, src5, src6, src7);
269        src += src_stride;
270        LD_UB4(src, 16, src8, src9, src10, src11);
271        src += src_stride;
272        LD_UB4(src, 16, src12, src13, src14, src15);
273        src += src_stride;
274
275        ST_UB4(src0, src1, src2, src3, dst, 16);
276        dst += dst_stride;
277        ST_UB4(src4, src5, src6, src7, dst, 16);
278        dst += dst_stride;
279        ST_UB4(src8, src9, src10, src11, dst, 16);
280        dst += dst_stride;
281        ST_UB4(src12, src13, src14, src15, dst, 16);
282        dst += dst_stride;
283    }
284}
285
/* 8-tap horizontal filter, 4x4 block, rounded and saturated to 8-bit output.
 * 'filter' points at the 8 signed 8-bit filter coefficients. */
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    /* 4-width shuffle pattern (second row of the mask table) */
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3; /* step back to the leftmost pixel the 8-tap window touches */

    /* rearranging filter: splat each halfword (one pair of adjacent taps)
     * into its own vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* masks for the remaining tap pairs, shifted 2/4/6 pixels right */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3); /* bias pixels into signed range */
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);  /* round-shift by the filter scale */
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1); /* pack to bytes, undo the bias */
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
314
/* 8-tap horizontal filter, 4x8 block: two 4-row passes through the 4-wide
 * filter macro, results rounded/saturated and stored 4 rows at a time. */
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    /* 4-width shuffle pattern (second row of the mask table) */
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3; /* center the 8-tap window */

    /* rearranging filter: one pair of adjacent taps per vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* rows 0-3 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    /* rows 4-7 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
351
/* 8-tap horizontal filter, 4x16 block: four 4-row passes through the 4-wide
 * filter macro, written out in two 8-row groups. */
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    /* 4-width shuffle pattern (second row of the mask table) */
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3; /* center the 8-tap window */

    /* rearranging filter: one pair of adjacent taps per vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* rows 0-7 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    /* rows 8-15 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
407
/* Dispatch the 4-pixel-wide 8-tap horizontal filter to the fixed-height
 * implementation matching 'height' (4, 8 or 16; other values are ignored). */
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 16:
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        break;
    }
}
420
/* 8-tap horizontal filter, 8-pixel-wide block of 'height' rows (height
 * assumed to be a multiple of 4).  The filter is applied inline (same
 * sequence as HORIZ_8TAP_8WID_4VECS_FILT) to 4 rows per iteration. */
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    /* 8-width shuffle pattern (first row of the mask table) */
    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3; /* center the 8-tap window */

    /* rearranging filter: one pair of adjacent taps per vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3); /* bias into signed range */
        src += (4 * src_stride);

        /* tap pairs applied in order 0, 2, 1, 3 — accumulation order does
         * not affect the result */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6); /* round-shift by scale */
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1); /* pack to bytes, undo bias */
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
472
/* 8-tap horizontal filter, 12-pixel-wide block: the left 8 columns use the
 * 8-width masks (mask00/1/2/3) and the right 4 columns use the 4-width
 * masks (mask0/4/5/6).  Four rows per iteration, 4 iterations — i.e. this
 * path handles 16 rows regardless of 'height' (presumably 12x16 is the
 * only caller size — TODO confirm). */
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);  /* 8-width pattern */
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);  /* 4-width pattern */

    src = src - 3; /* center the 8-tap window */

    /* rearranging filter: one pair of adjacent taps per vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3); /* bias into signed range */
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6); /* round-shift by scale */
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1); /* pack to bytes, undo bias */
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
551
/* 8-tap horizontal filter, 16-pixel-wide block of 'height' rows (height
 * assumed to be a multiple of 4).  Each row is split into two overlapping
 * 16-byte loads (offset 0 and 8) so the 8-wide filter macro covers the
 * full 16 output pixels. */
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    /* 8-width shuffle pattern (first row of the mask table) */
    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3; /* center the 8-tap window */

    /* rearranging filter: one pair of adjacent taps per vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even vectors: left halves of two rows; odd: right halves */
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3); /* bias into signed range */
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6); /* round-shift by scale */
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1); /* pack to bytes, undo bias */
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
609
610static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
611                                 uint8_t *dst, int32_t dst_stride,
612                                 const int8_t *filter, int32_t height)
613{
614    uint32_t loop_cnt;
615    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
616    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
617    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
618    v16i8 vec11;
619    v8i16 out0, out1, out2, out3, out8, out9, filt;
620
621    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
622    src -= 3;
623
624    /* rearranging filter */
625    filt = LD_SH(filter);
626    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
627
628    mask1 = mask0 + 2;
629    mask2 = mask0 + 4;
630    mask3 = mask0 + 6;
631    mask4 = mask0 + 8;
632    mask5 = mask0 + 10;
633    mask6 = mask0 + 12;
634    mask7 = mask0 + 14;
635
636    for (loop_cnt = 16; loop_cnt--;) {
637        LD_SB2(src, src_stride, src0, src2);
638        LD_SB2(src + 16, src_stride, src1, src3);
639        XORI_B4_128_SB(src0, src1, src2, src3);
640        src += (2 * src_stride);
641        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
642        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
643        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
644        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
645                    out8, out2, out9);
646        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
647        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
648        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
649        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
650        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
651                     out0, out8, out2, out9);
652        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
653        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
654        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
655        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
656        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
657                     out0, out8, out2, out9);
658        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
659        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
660        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
661        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
662        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
663                     out0, out8, out2, out9);
664        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
665        SRARI_H4_SH(out0, out8, out2, out9, 6);
666        SRARI_H2_SH(out1, out3, 6);
667        SAT_SH4_SH(out0, out8, out2, out9, 7);
668        SAT_SH2_SH(out1, out3, 7);
669        out = PCKEV_XORI128_UB(out8, out9);
670        ST_D2(out, 0, 1, dst + 16, dst_stride);
671        out = PCKEV_XORI128_UB(out0, out1);
672        ST_UB(out, dst);
673        dst += dst_stride;
674        out = PCKEV_XORI128_UB(out2, out3);
675        ST_UB(out, dst);
676        dst += dst_stride;
677    }
678}
679
/* 8-tap horizontal filter, 32-pixel-wide block of 'height' rows (height
 * assumed to be even).  Each row is covered by four overlapping 16-byte
 * loads (offsets 0/8/16/24) fed to the 8-wide filter macro. */
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    /* 8-width shuffle pattern (first row of the mask table) */
    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3; /* center the 8-tap window */

    /* rearranging filter: one pair of adjacent taps per vector */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* first row */
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3); /* bias into signed range */

        /* second row */
        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6); /* round-shift by scale */
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1); /* pack to bytes, undo bias */
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
740
741static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
742                                 uint8_t *dst, int32_t dst_stride,
743                                 const int8_t *filter, int32_t height)
744{
745    uint32_t loop_cnt;
746    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
747    v16i8 src4;
748    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
749    v8i16 filt, out0, out1, out2, out3;
750
751    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
752    src -= 3;
753
754    /* rearranging filter */
755    filt = LD_SH(filter);
756    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
757
758    mask1 = mask0 + 2;
759    mask2 = mask0 + 4;
760    mask3 = mask0 + 6;
761    mask4 = mask0 + 8;
762    mask5 = mask0 + 10;
763    mask6 = mask0 + 12;
764    mask7 = mask0 + 14;
765
766    for (loop_cnt = 64; loop_cnt--;) {
767        src0 = LD_SB(src);
768        src1 = LD_SB(src + 8);
769        src2 = LD_SB(src + 16);
770        src3 = LD_SB(src + 32);
771        src4 = LD_SB(src + 40);
772        src += src_stride;
773
774        XORI_B4_128_SB(src0, src1, src2, src3);
775        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
776
777        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
778                   vec0, vec1, vec2);
779        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
780        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
781                   vec0, vec1, vec2);
782        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
783        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
784        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
785                   vec0, vec1, vec2);
786        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
787        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
788
789        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
790                   vec0, vec1, vec2);
791        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
792        out2 = __msa_dpadd_s_h(out2, vec2, filt3);
793
794        SRARI_H2_SH(out0, out1, 6);
795        out3 = __msa_srari_h(out2, 6);
796        SAT_SH3_SH(out0, out1, out3, 7);
797        out = PCKEV_XORI128_UB(out0, out1);
798        ST_UB(out, dst);
799
800        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
801                   vec0, vec1, vec2);
802        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
803        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
804                   vec0, vec1, vec2);
805        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
806        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
807        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
808                   vec0, vec1, vec2);
809        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
810        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
812                   vec0, vec1, vec2);
813        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
814        out2 = __msa_dpadd_s_h(out2, vec2, filt3);
815
816        SRARI_H2_SH(out0, out1, 6);
817        out2 = __msa_srari_h(out2, 6);
818        SAT_SH3_SH(out0, out1, out2, 7);
819        out = PCKEV_XORI128_UB(out3, out0);
820        ST_UB(out, dst + 16);
821        out = PCKEV_XORI128_UB(out1, out2);
822        ST_UB(out, dst + 32);
823        dst += dst_stride;
824    }
825}
826
/* Horizontal 8-tap filter over a 64-pixel-wide block, one row per
 * loop iteration.
 *
 * The row is loaded as eight overlapping 16-byte vectors (stride 8) and
 * filtered in two 32-pixel halves.  Tap pairs are accumulated in the
 * order 0, 2, 1, 3 (re-using vec0..3 and vec4..7 as scratch); results
 * are rounded (shift 6), saturated to 8 bits and stored as four
 * 16-byte vectors.
 *
 * src        : source pointer (stepped back by 3 for the left taps)
 * src_stride : source row stride in bytes
 * dst        : destination pointer
 * dst_stride : destination row stride in bytes
 * filter     : 8 signed 8-bit filter coefficients
 * height     : number of rows to filter
 */
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3; /* cover the three left-hand filter taps */

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* shuffle masks for tap pairs 0/1, 2/3, 4/5, 6/7 */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        /* eight overlapping loads cover the full 64-wide row */
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        /* flip sign bit so unsigned pixels feed the signed dot products */
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* left 32 pixels */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        /* right 32 pixels */
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}
905
/* Vertical 8-tap filter over a 4-pixel-wide column, eight rows per
 * loop iteration.
 *
 * Seven history rows are preloaded; consecutive rows are byte-
 * interleaved (ILVR_B*) and then two 4-wide interleaved pairs are
 * packed into one vector (ILVR_D*), so each dot product produces two
 * output rows at once.  At the end of each iteration the packed
 * vectors are rotated to slide the 8-tap window down by eight rows.
 *
 * src        : source pointer (stepped back 3 rows for the top taps)
 * src_stride : source row stride in bytes
 * dst        : destination pointer
 * dst_stride : destination row stride in bytes
 * filter     : 8 signed 8-bit filter coefficients
 * height     : row count; assumed multiple of 8 (height >> 3 iterations)
 */
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride); /* cover the three top filter taps */

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* preload the 7 history rows needed before the first output row */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    /* interleave adjacent rows, then pack two 4-wide pairs per vector */
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    /* flip sign bit so unsigned pixels feed the signed dot products */
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        /* accumulate the four tap pairs for 8 output rows (2 per vector) */
        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* slide the 8-tap window down by the 8 rows just consumed */
        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
974
/* Vertical 8-tap filter over an 8-pixel-wide column, four rows per
 * loop iteration.
 *
 * Seven history rows are preloaded and byte-interleaved pairwise
 * (right halves only, since the column is 8 wide).  Each iteration
 * loads four new rows, computes four outputs via pairwise dot-product
 * accumulation, then rotates the interleaved vectors to slide the
 * window down four rows.
 *
 * src        : source pointer (stepped back 3 rows for the top taps)
 * src_stride : source row stride in bytes
 * dst        : destination pointer
 * dst_stride : destination row stride in bytes
 * filter     : 8 signed 8-bit filter coefficients
 * height     : row count; assumed multiple of 4 (height >> 2 iterations)
 */
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride); /* cover the three top filter taps */

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* preload 7 history rows; flip sign bit for the signed dot products */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        /* accumulate tap pairs 0/1 .. 6/7 for four output rows */
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the 8-tap window down by the 4 rows just consumed */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
1029
/* Vertical 8-tap filter over a 12-pixel-wide column, four rows per
 * loop iteration.
 *
 * A full 16-wide filter pass is computed (right/left interleaved
 * halves), but only 12 of the 16 result bytes per row are stored:
 * an 8-byte store followed by a 4-byte store at dst + 8.
 *
 * NOTE(review): the loop count is hard-coded to 4 (= 16 rows) and the
 * `height` parameter is unused — presumably this is only called with
 * height 16; confirm against callers before reusing.
 *
 * src        : source pointer (stepped back 3 rows for the top taps)
 * src_stride : source row stride in bytes
 * dst        : destination pointer
 * dst_stride : destination row stride in bytes
 * filter     : 8 signed 8-bit filter coefficients
 * height     : unused (16 rows are always processed)
 */
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride); /* cover the three top filter taps */

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* preload 7 history rows */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    /* flip sign bit so unsigned pixels feed the signed dot products */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* interleave adjacent rows: right halves (pixels 0-7) and left
     * halves (pixels 8-15) */
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        /* four output rows for each half */
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        /* store 12 bytes per row: 8-byte doubleword + 4-byte word */
        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        /* slide the 8-tap window down by the 4 rows just consumed */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}
1131
/* Vertical 8-tap filter over a 16-pixel-wide column, four rows per
 * loop iteration.
 *
 * Adjacent rows are byte-interleaved into right (pixels 0-7) and left
 * (pixels 8-15) halves; HEVC_FILT_8TAP_SH produces four output rows
 * per half, which are rounded (shift 6), saturated and packed back to
 * 16 bytes per row.
 *
 * src        : source pointer (stepped back 3 rows for the top taps)
 * src_stride : source row stride in bytes
 * dst        : destination pointer
 * dst_stride : destination row stride in bytes
 * filter     : 8 signed 8-bit filter coefficients
 * height     : row count; assumed multiple of 4 (height >> 2 iterations)
 */
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride); /* cover the three top filter taps */

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* preload 7 history rows; flip sign bit for the signed filtering */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        /* four output rows for each half */
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* slide the 8-tap window down by the 4 rows just consumed */
        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}
1210
/* Vertical 8-tap filter for widths that are multiples of 16.
 *
 * The block is processed as (width >> 4) independent 16-pixel-wide
 * vertical strips; each strip runs the same right/left-half interleave
 * and 4-rows-per-iteration scheme as common_vt_8t_16w_msa, using
 * per-strip src_tmp/dst_tmp pointers.
 *
 * src        : source pointer (stepped back 3 rows for the top taps)
 * src_stride : source row stride in bytes
 * dst        : destination pointer
 * dst_stride : destination row stride in bytes
 * filter     : 8 signed 8-bit filter coefficients
 * height     : row count; assumed multiple of 4 (height >> 2 iterations)
 * width      : column count; assumed multiple of 16
 */
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride); /* cover the three top filter taps */

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* one 16-wide strip per outer iteration */
    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* preload 7 history rows for this strip */
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            /* four output rows for each half */
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* slide the 8-tap window down by the 4 rows just consumed */
            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        /* advance to the next 16-wide strip */
        src += 16;
        dst += 16;
    }
}
1299
1300static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1301                                 uint8_t *dst, int32_t dst_stride,
1302                                 const int8_t *filter, int32_t height)
1303{
1304    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1305                              16);
1306
1307    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
1308                        height);
1309}
1310
1311static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1312                                 uint8_t *dst, int32_t dst_stride,
1313                                 const int8_t *filter, int32_t height)
1314{
1315    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1316                              32);
1317}
1318
1319static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1320                                 uint8_t *dst, int32_t dst_stride,
1321                                 const int8_t *filter, int32_t height)
1322{
1323    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1324                              48);
1325}
1326
1327static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1328                                 uint8_t *dst, int32_t dst_stride,
1329                                 const int8_t *filter, int32_t height)
1330{
1331    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1332                              64);
1333}
1334
1335static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
1336                                  int32_t src_stride,
1337                                  uint8_t *dst,
1338                                  int32_t dst_stride,
1339                                  const int8_t *filter_x,
1340                                  const int8_t *filter_y,
1341                                  int32_t height)
1342{
1343    uint32_t loop_cnt;
1344    v16u8 out0, out1;
1345    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346    v16i8 src9, src10, src11, src12, src13, src14;
1347    v8i16 filt0, filt1, filt2, filt3;
1348    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1349    v16i8 mask1, mask2, mask3;
1350    v8i16 filter_vec;
1351    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1352    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1353    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1354    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1355    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1356    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1357    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1358
1359    src -= ((3 * src_stride) + 3);
1360    filter_vec = LD_SH(filter_x);
1361    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1362
1363    filter_vec = LD_SH(filter_y);
1364    UNPCK_R_SB_SH(filter_vec, filter_vec);
1365
1366    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1367
1368    mask1 = mask0 + 2;
1369    mask2 = mask0 + 4;
1370    mask3 = mask0 + 6;
1371
1372    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1373    src += (7 * src_stride);
1374    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1375
1376    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1377    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1378    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1379               vec8, vec9, vec10, vec11);
1380    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1381               vec12, vec13, vec14, vec15);
1382
1383    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1384                              filt3);
1385    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1386                              filt3);
1387    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1388                              filt3);
1389    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1390                              filt3);
1391
1392    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1393    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1394    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1395
1396    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1397
1398    for (loop_cnt = height >> 3; loop_cnt--;) {
1399        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1400               src14);
1401        src += (8 * src_stride);
1402        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1403
1404        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1405                   vec0, vec1, vec2, vec3);
1406        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1407                   vec4, vec5, vec6, vec7);
1408        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1409                   vec8, vec9, vec10, vec11);
1410        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1411                   vec12, vec13, vec14, vec15);
1412
1413        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1414                                   filt3);
1415        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1416                                   filt3);
1417        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1418                                   filt2, filt3);
1419        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1420                                   filt2, filt3);
1421
1422        dst76_r = __msa_ilvr_h(dst117, dst66);
1423        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1424        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1425        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1426        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1427        dst1110_r = __msa_ilvr_h(dst117, dst1410);
1428
1429        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1430                                filt_h1, filt_h2, filt_h3);
1431        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1432                                filt_h1, filt_h2, filt_h3);
1433        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1434                                filt_h1, filt_h2, filt_h3);
1435        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1436                                filt_h1, filt_h2, filt_h3);
1437        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1438                                filt_h1, filt_h2, filt_h3);
1439        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1440                                filt_h1, filt_h2, filt_h3);
1441        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442                                filt_h1, filt_h2, filt_h3);
1443        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1444                                filt_h0, filt_h1, filt_h2, filt_h3);
1445
1446        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1447        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1448        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1449        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1450        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1451        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1452        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1453        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1454        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1455        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1456        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1457        dst += (8 * dst_stride);
1458
1459        dst10_r = dst98_r;
1460        dst32_r = dst1110_r;
1461        dst54_r = dst1312_r;
1462        dst21_r = dst109_r;
1463        dst43_r = dst1211_r;
1464        dst65_r = dst1413_r;
1465        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1466    }
1467}
1468
/* 2-D (horizontal-then-vertical) 8-tap interpolation producing rounded,
 * saturated 8-bit output, for block widths that are a multiple of 8.
 *
 * The image is processed in 8-column strips (see the `width >> 3` loop);
 * inside each strip a 7-row prologue fills the vertical filter window,
 * then the main loop emits two output rows per iteration, sliding the
 * window down by two rows each time.
 *
 * src, src_stride:   source pixels and stride in bytes
 * dst, dst_stride:   destination pixels and stride in bytes
 * filter_x/filter_y: 8-tap horizontal / vertical coefficient arrays
 * height:            number of output rows (processed two at a time)
 * width:             number of output columns, multiple of 8
 */
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    /* dst0..dst8: horizontal filter results for 9 consecutive source rows */
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    /* dstXY_r/_l: right/left halves of interleaved row pairs (rows X and Y)
     * feeding the vertical 8-tap filter */
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);  /* 8-width shuffle pattern */

    /* back up 3 rows and 3 columns so the 8-tap windows cover the
     * neighbourhood of the target sample */
    src -= ((3 * src_stride) + 3);

    /* splat the four 16-bit coefficient pairs of the horizontal filter */
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* sign-extend the vertical filter taps and splat them as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    /* derived shuffle masks select bytes 2/4/6 positions further right */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* prologue: load the first 7 rows of the vertical window */
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        /* XOR with 128 biases unsigned pixels into signed range for the
         * signed dot-product macros */
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        /* rows 4, 5, 6 — horizontal pass */
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        /* main loop: two output rows per iteration */
        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src_tmp += 2 * src_stride;

            /* interleave the 7-row window for the vertical filter */
            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_r, dst32_r, dst54_r, dst21_r);
            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_l, dst32_l, dst54_l, dst21_l);
            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

            /* horizontal pass for new row 7 */
            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            /* vertical pass for output row 0 of this pair */
            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            /* drop the horizontal-pass scaling */
            dst0_r >>= 6;
            dst0_l >>= 6;

            /* horizontal pass for new row 8 */
            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            /* vertical pass for output row 1 of this pair */
            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;
            /* rounding shift then clamp before packing back to bytes */
            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
            out = PCKEV_XORI128_UB(dst0, dst1);  /* pack + un-bias to u8 */
            ST_D2(out, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            /* slide the 7-row window down by two rows */
            dst0 = dst2;
            dst1 = dst3;
            dst2 = dst4;
            dst3 = dst5;
            dst4 = dst6;
            dst5 = dst7;
            dst6 = dst8;
        }

        /* advance to the next 8-column strip */
        src += 8;
        dst += 8;
    }
}
1607
/* 8-tap 2-D (HV) interpolation for 8-wide blocks: thin wrapper over the
 * generic multiple-of-8 width routine with width fixed at 8. */
static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}
1619
/* 8-tap 2-D (HV) interpolation for 12-wide blocks, rounded and saturated
 * to 8-bit output.  Split into two phases:
 *   1) columns 0..7  via the 8-wide path (two output rows per iteration,
 *      loop_cnt = 8, i.e. 16 rows);
 *   2) columns 8..11 via the 4-wide path, where each v8i16 carries the
 *      horizontal results of two rows at once (loop_cnt = 2, eight rows
 *      per iteration, i.e. 16 rows).
 * NOTE(review): `height` is unused — both loop counts hard-code 16 rows;
 * presumably callers only invoke this with height == 16 — confirm. */
static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    /* dstXY (e.g. dst30): 4-wide horizontal results of rows X and Y packed
     * into one v8i16; dst66 duplicates the upper (row 6) half of dst63 */
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
    v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
    v8i16 dst1413_r, dst87_l, filter_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l;

    /* back up 3 rows and 3 columns to centre the 8-tap windows */
    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    /* 8-width shuffle pattern and its +2/+4/+6 shifted variants */
    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    /* ---- phase 1: left 8 columns, same structure as the 8-wide path ---- */
    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    /* bias unsigned pixels to signed range for the signed dot products */
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);
    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                             filt2, filt3);

    /* rows 4, 5, 6 — horizontal pass */
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);

    /* 8 iterations x 2 rows = 16 output rows */
    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src_tmp, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src_tmp += 2 * src_stride;

        /* interleave the sliding 7-row window for the vertical filter */
        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
                   dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
                   dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* vertical pass, first output row of the pair */
        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        /* drop horizontal-pass scaling */
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* vertical pass, second output row of the pair */
        ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_r >>= 6;
        dst1_l >>= 6;
        /* round, clamp and pack back to bytes */
        SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
        out0 = PCKEV_XORI128_UB(dst0, dst1);
        ST_D2(out0, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* slide the window down two rows */
        dst0 = dst2;
        dst1 = dst3;
        dst2 = dst4;
        dst3 = dst5;
        dst4 = dst6;
        dst5 = dst7;
        dst6 = dst8;
    }

    /* ---- phase 2: right 4 columns (cols 8..11), 4-wide two-rows-packed
     * path using the 4-width shuffle patterns ---- */
    src += 8;
    dst += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* horizontal pass on row pairs (0,3) (1,4) (2,5) (3,6) */
    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    /* build the interleaved row pairs for the vertical filter */
    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    /* duplicate row 6's result (upper half of dst63) */
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    /* 2 iterations x 8 rows = 16 output rows */
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
                   vec11);
        VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
                   vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                   filt2, filt3);

        /* interleave newly computed rows into vertical-filter operands */
        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        /* vertical 8-tap filter for the eight output rows */
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        /* drop horizontal-pass scaling, then round, clamp and pack */
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        /* store eight 4-pixel rows */
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* rotate the vertical-filter operands for the next 8 rows */
        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}
1853
/* 8-tap 2-D (HV) interpolation for 16-wide blocks: wrapper over the
 * generic multiple-of-8 width routine with width fixed at 16. */
static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
}
1865
/* 8-tap 2-D (HV) interpolation for 24-wide blocks: wrapper over the
 * generic multiple-of-8 width routine with width fixed at 24. */
static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 24);
}
1877
/* 8-tap 2-D (HV) interpolation for 32-wide blocks: wrapper over the
 * generic multiple-of-8 width routine with width fixed at 32. */
static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 32);
}
1889
/* 8-tap 2-D (HV) interpolation for 48-wide blocks: wrapper over the
 * generic multiple-of-8 width routine with width fixed at 48. */
static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 48);
}
1901
/* 8-tap 2-D (HV) interpolation for 64-wide blocks: wrapper over the
 * generic multiple-of-8 width routine with width fixed at 64. */
static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 64);
}
1913
1914static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1915                                 uint8_t *dst, int32_t dst_stride,
1916                                 const int8_t *filter)
1917{
1918    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1919    v16u8 out;
1920    v8i16 filt, res0;
1921
1922    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1923    src -= 1;
1924
1925    /* rearranging filter */
1926    filt = LD_SH(filter);
1927    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1928
1929    mask1 = mask0 + 2;
1930
1931    LD_SB2(src, src_stride, src0, src1);
1932    XORI_B2_128_SB(src0, src1);
1933    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1934    res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1935    res0 = __msa_srari_h(res0, 6);
1936    res0 = __msa_sat_s_h(res0, 7);
1937    out = PCKEV_XORI128_UB(res0, res0);
1938    ST_W2(out, 0, 1, dst, dst_stride);
1939}
1940
/* 4-tap horizontal filter, 4 columns x 4 rows, rounded/saturated 8-bit
 * output.  All four rows are filtered in one shot by the 4-wide
 * 4-vector helper macro. */
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1;
    v16u8 out;

    /* 4-width shuffle pattern */
    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;  /* 4-tap window starts one column left of the target */

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);  /* bias to signed range */
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    SRARI_H2_SH(out0, out1, 6);  /* rounding shift */
    SAT_SH2_SH(out0, out1, 7);   /* clamp to 8-bit signed range */
    out = PCKEV_XORI128_UB(out0, out1);  /* pack + un-bias to unsigned */
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
1967
/* 4-tap horizontal filter, 4 columns x 8 rows, rounded/saturated 8-bit
 * output.  Two passes of four rows each through the 4-wide helper macro. */
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    /* 4-width shuffle pattern */
    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;  /* 4-tap window starts one column left of the target */

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* first four rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);  /* bias to signed range */
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    /* second four rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);  /* rounding shift */
    SAT_SH4_SH(out0, out1, out2, out3, 7);   /* clamp to 8-bit signed */
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
2002
/* 4-tap horizontal filter, 4 columns x 16 rows, rounded/saturated 8-bit
 * output.  Two unrolled 8-row passes, each filtering eight rows through
 * two invocations of the 4-wide helper macro. */
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    /* 4-width shuffle pattern */
    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;  /* 4-tap window starts one column left of the target */

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* rows 0..7 */
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);  /* rounding shift */
    SAT_SH4_SH(out0, out1, out2, out3, 7);   /* clamp to 8-bit signed */
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    /* rows 8..15 */
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
2050
2051static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2052                                uint8_t *dst, int32_t dst_stride,
2053                                const int8_t *filter, int32_t height)
2054{
2055    if (2 == height) {
2056        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2057    } else if (4 == height) {
2058        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2059    } else if (8 == height) {
2060        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2061    } else if (16 == height) {
2062        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2063    }
2064}
2065
/* 4-tap horizontal filter, 6 columns x 8 rows, rounded/saturated 8-bit
 * output.  Uses the 8-wide helper macro, then stores each row as one
 * 4-byte word plus one 2-byte halfword to cover exactly 6 pixels.
 * NOTE(review): `height` is unused — the body is unrolled for 8 rows;
 * presumably callers only pass height == 8 — confirm. */
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out4, out5;
    v8i16 filt, out0, out1, out2, out3;

    /* 8-width shuffle pattern (first 16-byte row of the mask table) */
    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;  /* 4-tap window starts one column left of the target */

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* rows 0..3 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);  /* bias to signed range */
    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                               filt1, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);  /* rounding shift */
    SAT_SH4_SH(out0, out1, out2, out3, 7);   /* clamp to 8-bit signed */
    out4 = PCKEV_XORI128_UB(out0, out1);
    out5 = PCKEV_XORI128_UB(out2, out3);
    /* per row: 4 pixels via word store + 2 pixels via halfword store */
    ST_W2(out4, 0, 2, dst, dst_stride);
    ST_H2(out4, 2, 6, dst + 4, dst_stride);
    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    /* rows 4..7 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                               filt1, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out4 = PCKEV_XORI128_UB(out0, out1);
    out5 = PCKEV_XORI128_UB(out2, out3);
    ST_W2(out4, 0, 2, dst, dst_stride);
    ST_H2(out4, 2, 6, dst + 4, dst_stride);
    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
2114
/* Horizontal 4-tap filter, 8-column block, two rows per loop iteration.
 * Used for heights that are multiples of 2 (but not 4) — see
 * common_hz_4t_8w_msa. Output is rounded (>>6 with rounding),
 * saturated and packed to unsigned bytes. */
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, vec0, vec1, vec2, vec3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* center the 4-tap window one pixel to the left */
    src -= 1;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        /* bias to signed, shuffle byte pairs, dot-product + accumulate */
        XORI_B2_128_SB(src0, src1);
        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
        SRARI_H2_SH(vec0, vec1, 6);
        SAT_SH2_SH(vec0, vec1, 7);
        out = PCKEV_XORI128_UB(vec0, vec1);
        ST_D2(out, 0, 1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
2148
/* Horizontal 4-tap filter, 8-column block, four rows per loop
 * iteration. Used for heights that are multiples of 4. */
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* center the 4-tap window one pixel to the left */
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        /* round-shift by 6, saturate, pack to bytes, store 4x8 pixels */
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2182
2183static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2184                                uint8_t *dst, int32_t dst_stride,
2185                                const int8_t *filter, int32_t height)
2186{
2187    if ((2 == height) || (6 == height)) {
2188        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2189                                 height);
2190    } else {
2191        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2192                                 height);
2193    }
2194}
2195
/* Horizontal 4-tap filter, 12-column block, 16 rows (4 iterations x 4
 * rows). Columns 8..11 are produced with the second mask pair and
 * stored as words at dst + 8; columns 0..7 are produced with the first
 * mask pair and stored as doublewords at dst. */
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* mask starting at offset 8 for the upper 4 columns */
    mask2 = LD_SB(&ff_hevc_mask_arr[32]);

    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        /* columns 8..11 (4 width), two rows per vector */
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
        SRARI_H2_SH(out0, out1, 6);
        SAT_SH2_SH(out0, out1, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);

        /* columns 0..7 (8 width), one row per vector */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out2, out3, out4, out5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
                     out2, out3, out4, out5);
        SRARI_H4_SH(out2, out3, out4, out5, 6);
        SAT_SH4_SH(out2, out3, out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        tmp1 = PCKEV_XORI128_UB(out4, out5);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2249
/* Horizontal 4-tap filter, 16-column block, four rows per loop
 * iteration. Each row is loaded as two overlapping 16-byte vectors
 * (offsets 0 and 8) so the shuffle masks can cover all 16 outputs. */
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 out;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* center the 4-tap window one pixel to the left */
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even vectors: columns 0.., odd vectors: columns 8.. */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* rows 0-1 */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        /* rows 2-3 */
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out4, out5, out6, out7);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
2312
/* Horizontal 4-tap filter, 24-column block, 32 rows (8 iterations x 4
 * rows). Columns 0..15 use the base masks plus cross-vector masks
 * (mask00/mask11) for the seam between the two 16-byte loads; columns
 * 16..23 are handled as a separate 8-wide pass written through dst1. */
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *dst1 = dst + 16;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
    v8i16 filt, out0, out1, out2, out3;
    v16u8 tmp0, tmp1;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* center the 4-tap window one pixel to the left */
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    /* masks shifted by 8/10 to select bytes spanning src pairs */
    mask00 = mask0 + 8;
    mask11 = mask0 + 10;

    for (loop_cnt = 8; loop_cnt--;) {
        /* even vectors: columns 0..15, odd vectors: columns 16.. */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        /* 16 width: rows 0-1 */
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        /* 16 width: rows 2-3 */
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        /* 8 width */
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);

        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
        dst1 += (4 * dst_stride);
    }
}
2395
/* Horizontal 4-tap filter, 32-column block, two rows per loop
 * iteration. Each row is loaded as four overlapping 16-byte vectors
 * (offsets 0, 8, 16, 24) so every 8-output group has its full
 * source window available. */
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* center the 4-tap window one pixel to the left */
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* first row, 4 overlapping loads */
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        /* second row */
        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* row 0: tap0 dot-products then tap1 accumulate */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        /* row 1 */
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out4, out5, out6, out7);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
2463
/* Vertical 4-tap filter, 4-column block, exactly 2 output rows.
 * Neighboring rows are byte-interleaved so one dot-product per
 * coefficient pair computes the vertical convolution. */
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332, filt0, filt1;
    v16u8 out;
    v8i16 filt, out10;

    /* start one row above: 4-tap window covers rows -1..2 */
    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    /* interleave row pairs, then pack two pairs per vector */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    /* round-shift by 6, saturate, pack to bytes */
    out10 = __msa_srari_h(out10, 6);
    out10 = __msa_sat_s_h(out10, 7);
    out = PCKEV_XORI128_UB(out10, out10);
    ST_W2(out, 0, 1, dst, dst_stride);
}
2494
/* Vertical 4-tap filter, 4-column block, four rows per loop iteration
 * for heights that are multiples of 4. The last two interleaved rows
 * are carried over between iterations in src2110/src2. */
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    /* start one row above the output block */
    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

        /* src2 is reused as the carry row for the next iteration */
        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2540
2541static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2542                                uint8_t *dst, int32_t dst_stride,
2543                                const int8_t *filter, int32_t height)
2544{
2545    if (2 == height) {
2546        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2547    } else {
2548        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2549                                     height);
2550    }
2551}
2552
/* Vertical 4-tap filter, 6-column block, 8 rows (two unrolled 4-row
 * halves). Each row stores a 4-byte word plus a 2-byte halfword.
 * src54_r/src65_r and src6 carry state from the first half into the
 * second, so statement order here matters. The height parameter is
 * unused; this path always produces 8 rows. */
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;

    /* start one row above the output block */
    src -= src_stride;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);

    /* round-shift by 6, saturate, pack, store rows 0-3 as 4+2 bytes */
    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    /* second half: reuse src6 (last row of first half) as the carry */
    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);

    dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
    dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);

    /* rows 4-7 */
    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
2623
/* Vertical 4-tap filter, 8-column block, exactly 2 output rows. */
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
    v16u8 out;

    /* start one row above: 4-tap window needs rows -1..2 */
    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* interleave vertically adjacent rows for the dot products */
    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
    tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
    tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
    /* round-shift by 6, saturate, pack to bytes */
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST_D2(out, 0, 1, dst, dst_stride);
}
2649
/* Vertical 4-tap filter, 8-column block, exactly 6 output rows
 * (two iterations of three rows each). The last interleaved rows
 * (vec3, vec4) and the last source row (src5) carry over between
 * iterations. */
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1;

    /* start one row above the output block */
    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);

        XORI_B3_128_SB(src3, src4, src5);
        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
        tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
        /* round-shift by 6, saturate, pack three rows into two vectors */
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
        XORI_B2_128_SH(tmp0, tmp2);

        /* store three 8-byte rows via scalar doubleword copies */
        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
        dst += dst_stride;
        SD(out2, dst);
        dst += dst_stride;

        /* carry last row and last two interleaves into next iteration */
        src2 = src5;
        vec0 = vec3;
        vec2 = vec4;
    }
}
2702
/* Vertical 4-tap filter, 8-column block, four rows per loop iteration
 * for heights that are multiples of 4. */
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    /* start one row above the output block */
    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
        out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        /* round-shift by 6, saturate, pack, store 4x8 pixels */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the last two interleaves and last row forward */
        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}
2747
2748static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2749                                uint8_t *dst, int32_t dst_stride,
2750                                const int8_t *filter, int32_t height)
2751{
2752    if (2 == height) {
2753        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2754    } else if (6 == height) {
2755        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2756    } else {
2757        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2758                                 filter, height);
2759    }
2760}
2761
/* Vertical 4-tap filter, 12-column block, 16 rows (4 iterations x 4
 * rows). Columns 0..7 come from the right (low) interleaves; columns
 * 8..11 from the left (high) interleaves packed pairwise into one
 * vector (src2110 etc.) and stored as words at dst + 8. */
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16u8 out0, out1;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec;

    /* start one row above the output block */
    src -= (1 * src_stride);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
        dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);

        /* round-shift by 6, saturate, pack and store 8 + 4 columns */
        SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_H2_SH(dst0_l, dst1_l, 6);
        SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SH2_SH(dst0_l, dst1_l, 7);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        /* carry last row and interleaves into next iteration */
        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src2110 = src6554;
    }
}
2824
/* Vertical 4-tap filter, 16-column block, four rows per loop
 * iteration. Right (low byte) and left (high byte) interleaves are
 * filtered separately and recombined by the pack. */
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    /* start one row above the output block */
    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        /* round-shift by 6, saturate, pack low+high halves per row */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry last interleaves and last row forward */
        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}
2882
/* Vertical 4-tap interpolation filter over a 24-column block,
 * unidirectional prediction.  The block is handled as a 16-column part
 * (low+high interleave halves) plus an 8-column part (right interleave
 * only, stored via 64-bit copies at dst + 16).
 *
 * NOTE(review): loop_cnt is fixed at 8 and each iteration emits 4 rows,
 * so this path produces exactly 32 rows regardless of `height` -- it is
 * presumably only reached with height == 32; confirm against callers.
 */
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v16u8 out;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    /* one context row above the first output row */
    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width: prologue rows 0..2, sign-flipped and interleaved */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* 8 width: same prologue for columns 16..23 (right halves only) */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = 8; loop_cnt--;) {
        /* 16 width: rows 3 and 4 */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8 width: rows 3 and 4 of the right-hand 8 columns */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        /* 16 width: 4-tap vertical filter for two output rows */
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* 8 width: same two rows for columns 16..23 */
        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);

        /* 16 + 8 width: round >>6, saturate, pack and store two rows */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
        XORI_B2_128_SH(out2_r, out3_r);
        /* right 8 columns leave as 64-bit scalar stores */
        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
        SD(out0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        SD(out1, dst + 16);
        dst += dst_stride;

        /* 16 width: rows 5 and 6; note src10_r/21_r are REUSED here to
         * hold the 54/65 pairs for the second half of the iteration */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        /* 8 width: likewise src76_r/87_r are reused for the new pairs */
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        /* 16 width: next two output rows */
        out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);

        /* 8 width: next two output rows for columns 16..23 */
        out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);

        /* 16 + 8 width: round, saturate, pack, store two more rows */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2_r, out2_r);
        ST_D1(out, 0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out3_r, out3_r);
        ST_D1(out, 0, dst + 16);
        dst += dst_stride;
    }
}
2993
/* Vertical 4-tap interpolation filter over a 32-column block,
 * unidirectional prediction.  Handled as two independent 16-column
 * pipelines (columns 0..15 at dst, columns 16..31 at dst + 16), each
 * producing 2 output rows per loop iteration; height is assumed even.
 */
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt;
    v16i8 filt0, filt1;
    v16u8 out;

    /* one context row above the first output row */
    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width: prologue rows, sign-flipped and interleaved in pairs */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* next 16 width: same prologue for columns 16..31 */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16 width: two new rows, extend the row-pair chain */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 16 width: 4-tap vertical filter, two output rows */
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* 16 width: round >>6, saturate to 8-bit, pack and store */
        SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst + dst_stride);

        /* slide the left-half window by two rows */
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16 width: two new rows for columns 16..31 */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        /* next 16 width: filter the same two output rows */
        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);

        /* next 16 width: round, saturate, pack, store at dst + 16 */
        SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
        SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
        out = PCKEV_XORI128_UB(out2_r, out2_l);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out3_r, out3_l);
        ST_UB(out, dst + 16 + dst_stride);

        dst += 2 * dst_stride;

        /* slide the right-half window by two rows */
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
3086
/* 2-D (horizontal then vertical) 4-tap interpolation for a 4x2 block,
 * unidirectional prediction.  Reads 5 input rows (2 output rows plus
 * 3 rows of 4-tap context), filters horizontally into 16-bit values,
 * then vertically into 32-bit values, and stores two 4-byte rows.
 * The mask at ff_hevc_mask_arr + 16 shuffles two rows per vector, so
 * each horizontal filter call covers two source rows at once.
 */
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v4i32 dst0, dst1;

    /* back up one row and one column of 4-tap context */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16-bit, splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* horizontal stage: dstNM holds filtered rows N and M packed */
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    /* interleave consecutive filtered rows for the vertical stage */
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    /* vertical stage in 32-bit, scale down intermediate by >>6 */
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    /* final rounded >>6, saturate to 8-bit, pack and store 2 words */
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_srari_h(tmp, 6);
    tmp = __msa_sat_s_h(tmp, 7);
    out = PCKEV_XORI128_UB(tmp, tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}
3141
/* 2-D (horizontal then vertical) 4-tap interpolation for a 4x4 block,
 * unidirectional prediction.  Reads 7 input rows (4 output rows plus
 * 3 context rows).  Two source rows are shuffled into each vector, so
 * dstNM below holds the horizontally filtered rows N and M packed.
 */
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, tmp0, tmp1;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    /* back up one row and one column of 4-tap context */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16-bit, splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* horizontal stage: 4 filter calls cover all 7 rows pairwise */
    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    /* interleave consecutive filtered rows for the vertical stage */
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    /* vertical stage in 32-bit; scale >>6, then round >>6 to pixels */
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
3200
/* 2-D (horizontal then vertical) 4-tap interpolation for a 4-column
 * block whose height is a multiple of 8, unidirectional prediction.
 * Processes 8 output rows per loop iteration; as in the 4x2/4x4
 * variants, each horizontal filter call covers two source rows
 * (dst73 = rows 7 and 3 packed, etc.).
 */
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;

    /* back up one row and one column of 4-tap context */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16-bit, splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* prologue: horizontally filter the 3 context rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* dst22 = filtered row 2 broadcast, needed to seed dst32_r below */
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        /* horizontal stage: rows (3,7), (4,8), (5,9), (6,10) pairwise */
        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* build the row-pair chain 32,43,...,109 for the vertical stage */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* vertical stage: 8 output rows in 32-bit precision */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        /* scale >>6, pack, round >>6, saturate, store 8 rows of 4 bytes */
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* slide the window: last two row pairs seed the next iteration */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
3298
3299static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3300                                  int32_t src_stride,
3301                                  uint8_t *dst,
3302                                  int32_t dst_stride,
3303                                  const int8_t *filter_x,
3304                                  const int8_t *filter_y,
3305                                  int32_t height)
3306{
3307    if (2 == height) {
3308        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3309                               filter_x, filter_y);
3310    } else if (4 == height) {
3311        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3312                               filter_x, filter_y);
3313    } else if (0 == (height % 8)) {
3314        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3315                                       filter_x, filter_y, height);
3316    }
3317}
3318
/* 2-D (horizontal then vertical) 4-tap interpolation for a 6-column
 * block, unidirectional prediction.  Produces exactly 8 output rows
 * (3 context rows + 8 rows are loaded; `height` is not used in the
 * body -- presumably only called with height == 8, TODO confirm).
 *
 * Columns 0..3 come from the right (_r) interleave halves; columns
 * 4..7 live in the left (_l) halves, where two rows' worth are packed
 * into one vector (dst1021_l etc.) so a single vertical filter call
 * covers two output rows.  Only 2 of those 4 columns are stored
 * (ST_H8 writes halfwords at dst + 4), giving the 6-column width.
 */
static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;

    /* back up one row and one column of 4-tap context */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16-bit, splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* horizontal stage: 3 context rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    /* horizontal stage: the 8 rows that produce output */
    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    /* interleave consecutive filtered rows for the vertical stage */
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);

    /* pack two rows' left halves per vector: one filter call / 2 rows */
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    /* vertical stage: 8 rows for columns 0..3, 4 calls for columns 4+ */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    /* scale >>6, pack, round >>6, saturate to 8-bit */
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
    SRARI_H2_SH(tmp4, tmp5, 6);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3,7);
    SAT_SH2_SH(tmp4, tmp5,7);
    /* store: 8 rows x 4 bytes at dst, 8 rows x 2 bytes at dst + 4 */
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    out2 = PCKEV_XORI128_UB(tmp4, tmp5);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
3437
/* 2-D (horizontal then vertical) 4-tap interpolation for an 8x2 block,
 * unidirectional prediction.  Reads 5 input rows (2 output rows plus
 * 3 rows of 4-tap context); each row is filtered horizontally on its
 * own (8-column mask at ff_hevc_mask_arr), then the vertical stage
 * runs in 32-bit over the right/left interleave halves.
 */
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    /* back up one row and one column of 4-tap context */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16-bit, splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    /* horizontal stage: one 4-tap filter per row */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    /* interleave consecutive filtered rows for the vertical stage */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    /* vertical stage: two output rows in 32-bit precision */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* scale >>6, pack, round >>6, saturate, store 2 rows of 8 bytes */
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    SAT_SH2_SH(out0_r, out1_r, 7);
    out = PCKEV_XORI128_UB(out0_r, out1_r);
    ST_D2(out, 0, 1, dst, dst_stride);
}
3499
/* 2-D (horizontal then vertical) 4-tap interpolation producing 4 output
 * rows over a width that is a multiple of 8, unidirectional prediction.
 *
 * width8mult : number of 8-column stripes; each loop iteration handles
 *              one stripe (src/dst advance by 8 columns, rows restart),
 *              reading 7 rows (4 output + 3 context) per stripe.
 */
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t width8mult)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    /* back up one row and one column of 4-tap context */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend vertical taps to 16-bit, splat as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        /* next 8-column stripe, same 7 rows */
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* horizontal stage: 3 context rows */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        /* horizontal stage: rows 3..6 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

        /* vertical stage: 4 output rows in 32-bit precision */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        /* scale >>6, pack, round >>6, saturate to 8-bit */
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
3584
/* 2D (horizontal + vertical) 4-tap uni-prediction interpolation for one
 * 8x6 block: a 4-tap horizontal filter (filter_x) is applied per row,
 * then a 4-tap vertical filter (filter_y) across the filtered rows,
 * followed by rounding, saturation and store of 8-bit output.
 * Strides are in bytes.  Reads 9 input rows (6 output rows + 3 extra
 * for the vertical taps). */
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    /* step back one row and one column so the 4-tap windows are centred */
    src -= (src_stride + 1);

    /* broadcast the two 16-bit horizontal filter tap pairs */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bit, then splat as word pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* mask1 selects the next tap pair (offset by 2 bytes) */
    mask1 = mask0 + 2;

    /* load all 9 source rows up front */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    /* XOR with 0x80 biases unsigned pixels to the signed domain so the
     * signed multiply-accumulate filter macros can be used */
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    /* horizontal pass: shuffle tap neighbours for each row ... */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    /* ... and apply the 4-tap horizontal filter, one result row each */
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    /* interleave adjacent filtered rows (low/high halves) so the vertical
     * filter can consume row pairs */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    /* vertical pass: 4-tap filter over rows n..n+3 for the 6 output rows */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    /* drop intermediate precision (>> 6), pack 32->16 bit ... */
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    /* ... round (>> 6 with rounding), saturate to 8-bit signed range ... */
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH2_SH(out4_r, out5_r, 7);
    /* ... pack to bytes and undo the -128 bias (XOR with 0x80) */
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);

    /* store six 8-byte output rows */
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
3687
/* 2D 4-tap uni-prediction interpolation for blocks that are a multiple of
 * 8 columns wide (width8mult stripes of 8) and a multiple of 4 rows high.
 * Each stripe is filtered horizontally (filter_x) then vertically
 * (filter_y), 4 output rows per inner iteration, with the last three
 * horizontally-filtered rows carried over between iterations. */
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height,
                                           int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    /* step back one row and one column so the 4-tap windows are centred */
    src -= (src_stride + 1);

    /* broadcast the two 16-bit horizontal filter tap pairs */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bit, then splat as word pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* mask1 selects the next tap pair (offset by 2 bytes) */
    mask1 = mask0 + 2;

    /* process the block as width8mult independent 8-wide stripes */
    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* prologue: horizontally filter the first 3 rows of the stripe */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        /* bias pixels to the signed domain (XOR with 0x80) */
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        /* interleave row pairs for the vertical filter */
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        /* main loop: 4 output rows per iteration */
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            /* horizontal pass for the 4 new rows */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            /* vertical pass over rows n..n+3 for each of the 4 outputs */
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            /* shift, pack, round, saturate and unbias back to 8 bit */
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* carry the last three rows' data into the next iteration */
            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        /* advance to the next 8-column stripe */
        src += 8;
        dst += 8;
    }
}
3801
/* 8-wide 2D 4-tap uni-prediction: dispatch to the kernel specialized
 * for this block height (2, 4, 6, or any multiple of 4). */
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    switch (height) {
    case 2:
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
        break;
    case 4:
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 1);
        break;
    case 6:
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
        break;
    default:
        if (0 == (height % 4)) {
            hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                           filter_x, filter_y, height, 1);
        }
        break;
    }
}
3824
/* 12-wide 2D 4-tap uni-prediction interpolation.  Processed in two
 * phases: the left 8 columns with the 8-width shuffle masks
 * (4 rows x 4 iterations), then the remaining 4 columns with the
 * 4-width two-source masks, packing two rows per vector
 * (8 rows x 2 iterations).  NOTE(review): both phases iterate a fixed
 * count, so this appears to assume height == 16 — confirm with callers. */
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    /* step back one row and one column so the 4-tap windows are centred */
    src -= (src_stride + 1);

    /* broadcast the two 16-bit horizontal filter tap pairs */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bit, then splat as word pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    /* 8-width shuffle masks for the left 8 columns */
    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    src_tmp = src;
    dst_tmp = dst;

    /* phase 1 prologue: horizontally filter the first 3 rows */
    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    /* bias pixels to the signed domain (XOR with 0x80) */
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    /* interleave row pairs for the vertical filter */
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    /* phase 1: left 8 columns, 4 output rows per iteration */
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        /* horizontal pass for the 4 new rows */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        /* vertical pass over rows n..n+3 */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        /* shift, pack, round, saturate and unbias back to 8 bit */
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        /* carry the last three rows' data into the next iteration */
        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    /* phase 2: remaining 4 columns */
    src += 8;
    dst += 8;

    /* 4-width two-source shuffle masks */
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    /* prologue: horizontally filter the first 3 rows (two rows packed
     * per vector via the two-source shuffles) */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    /* duplicate row 2's half for the next interleave */
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    /* 8 output rows per iteration */
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* each shuffle pairs row n with row n+4 in one vector */
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* build the interleaved row-pair operands for the vertical filter */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* vertical pass, 8 output rows */
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        /* shift, pack, round, saturate and unbias back to 8 bit */
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        /* store eight 4-byte output rows */
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* carry the trailing rows into the next iteration */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
3992
/* 16-wide 2D 4-tap uni-prediction: two 8-wide stripes (width8mult == 2).
 * Height 4 has a dedicated single-pass kernel; every other height goes
 * through the generic multiple-of-4-rows kernel. */
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    if (height == 4) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 2);
        return;
    }

    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);
}
4009
/* 24-wide 2D 4-tap uni-prediction: three 8-wide stripes via the generic
 * multiple-of-8-columns kernel. */
static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 3);
}
4021
/* 32-wide 2D 4-tap uni-prediction: four 8-wide stripes via the generic
 * multiple-of-8-columns kernel. */
static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 4);
}
4033
/* Generates the exported uni-prediction pixel-copy entry points
 * (ff_hevc_put_hevc_uni_pel_pixels<W>_8_msa), each forwarding to the
 * width-specific copy_width<W>_msa routine.  mx, my and width belong to
 * the common HEVC MC function prototype and are unused here. */
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

/* Instantiate the copy wrapper for every supported block width. */
UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY
4056
/* Generates the exported 1-D uni-prediction entry points
 * (ff_hevc_put_hevc_uni_<pel>_<dir><W>_8_msa).  FILT_DIR selects which
 * fractional offset (mx or my) indexes the filter table; the table is
 * indexed with the fractional position minus 1.  DIR1 (hz/vt) picks the
 * horizontal or vertical common_* worker, TAP its tap count
 * (8 = qpel/luma, 4 = epel/chroma). */
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
                                                       ptrdiff_t dst_stride,   \
                                                       uint8_t *src,           \
                                                       ptrdiff_t src_stride,   \
                                                       int height,             \
                                                       intptr_t mx,            \
                                                       intptr_t my,            \
                                                       int width)              \
{                                                                              \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
                                                                               \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                            filter, height);                   \
}

/* 8-tap horizontal (qpel) wrappers, all widths */
UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

/* 8-tap vertical (qpel) wrappers, all widths */
UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

/* 4-tap horizontal (epel) wrappers, all widths */
UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

/* 4-tap vertical (epel) wrappers, all widths */
UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC
4108
/* Generates the exported 2D (hv) uni-prediction entry points
 * (ff_hevc_put_hevc_uni_<pel>_hv<W>_8_msa).  Both fractional offsets
 * (mx, my) index the filter table (fractional position minus 1) to
 * obtain the horizontal and vertical filters, which are forwarded to
 * the hevc_hv_uni_* worker of matching tap count and width. */
#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
                                                                           \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                        filter_x, filter_y, height);       \
}

/* 8-tap (qpel) 2D wrappers, all widths */
UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

/* 4-tap (epel) 2D wrappers, all widths */
UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV
4144