/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

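/* Saturating add of each 16-bit intermediate pair, then a rounding shift
 * right by 7 with clip to unsigned 8-bit; (in0 + vec0) fills the low half
 * of the result and (in1 + vec1) the high half. */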
static av_always_inline __m128i
hevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1)
{
    __m128i out;

    vec0 = __lsx_vsadd_h(in0, vec0);
    vec1 = __lsx_vsadd_h(in1, vec1);
    out  = __lsx_vssrarni_bu_h(vec1, vec0, 7);
    return out;
}

/* hevc_bi_copy: dst = av_clip_uint8(((src0 << 6) + src1 + 64) >> 7) */
static
void hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                         int16_t *src1_ptr, int32_t src2_stride,
                         uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t loop_cnt = height >> 3;
    int32_t res = (height & 0x07) >> 1;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_2x = (src2_stride << 1);
    int32_t src2_stride_4x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
    __m128i src0, src1;
    __m128i zero = __lsx_vldi(0);
    __m128i in0, in1, in2, in3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3;
    __m128i dst0, dst1, dst2, dst3;

    for (; loop_cnt--;) {
        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
        src0_ptr += src_stride_4x;
        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
        src0 = __lsx_vilvl_d(tmp1, tmp0);
        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
        src1 = __lsx_vilvl_d(tmp1, tmp0);
        src0_ptr += src_stride_4x;

        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
        src1_ptr += src2_stride_4x;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
        src1_ptr += src2_stride_4x;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2);
        DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3);
        DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
        __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
        __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
        __lsx_vstelm_w(dst1, dst, 0, 0);
        __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
        __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
        __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
    }
    for (; res--;) {
        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src1_ptr, 0);
        reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
        src0 = __lsx_vilvl_w(reg1, reg0);
        in0  = __lsx_vilvl_d(reg3, reg2);
        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
        dst0 = __lsx_vsadd_h(dst0, in0);
        dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
        src0_ptr += src_stride_2x;
        src1_ptr += src2_stride_2x;
        dst += dst_stride_2x;
    }
}

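/* Same bi-prediction copy as the 4w case, for 6-pixel wide blocks: each
 * output row is stored as one 32-bit word plus one 16-bit halfword. */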
static
void hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                         int16_t *src1_ptr, int32_t src2_stride,
                         uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t loop_cnt;
    int32_t res = (height & 0x07) >> 1;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i out0, out1, out2, out3;
    __m128i zero = __lsx_vldi(0);
    __m128i src0, src1, src2, src3;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i reg0, reg1, reg2, reg3;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
        src0_ptr += src_stride_4x;
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in5, in6);
        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0, dst2, dst4, dst6);
        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
                  dst1, dst3, dst5, dst7);
        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
                  dst5, dst7);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
        __lsx_vstelm_w(out0, dst, 0, 0);
        __lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
        __lsx_vstelm_h(out0, dst, 4, 2);
        __lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
        __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
        __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
        dst += dst_stride_4x;
        __lsx_vstelm_w(out2, dst, 0, 0);
        __lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
        __lsx_vstelm_h(out2, dst, 4, 2);
        __lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
        __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
        __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
        __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
        dst += dst_stride_4x;
    }
    for (; res--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        src0 = __lsx_vilvl_d(reg1, reg0);
        src0_ptr += src_stride_2x;
        in0 = __lsx_vld(src1_ptr, 0);
        in1 = __lsx_vldx(src1_ptr, src2_stride_x);
        src1_ptr += src2_stride_x;
        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
        dst1 = __lsx_vilvh_b(zero, src0);
        dst1 = __lsx_vslli_h(dst1, 6);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        __lsx_vstelm_w(out0, dst, 0, 0);
        __lsx_vstelm_h(out0, dst, 4, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        __lsx_vstelm_h(out0, dst, 4, 6);
        dst += dst_stride;
    }
}

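/* Bi-prediction copy for 8-pixel wide blocks, eight rows per main-loop
 * iteration with a two-row tail loop for heights that are not a multiple
 * of eight. */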
static
void hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                         int16_t *src1_ptr, int32_t src2_stride,
                         uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t loop_cnt = height >> 3;
    int32_t res = (height & 7) >> 1;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i out0, out1, out2, out3;
    __m128i src0, src1, src2, src3;
    __m128i zero = __lsx_vldi(0);
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i reg0, reg1, reg2, reg3;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
        src0_ptr += src_stride_4x;
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
        src0_ptr += src_stride_4x;
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0, dst2, dst4, dst6);
        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
                  src3, dst1, dst3, dst5, dst7);
        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
                  dst3, dst5, dst7);
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in5, in6);
        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;
        __lsx_vstelm_d(out2, dst, 0, 0);
        __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;
    }
    for (; res--;) {
        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
        src0 = __lsx_vilvl_d(reg1, reg0);
        in0  = __lsx_vld(src1_ptr, 0);
        in1  = __lsx_vldx(src1_ptr, src2_stride_x);
        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
        dst1 = __lsx_vilvh_b(zero, src0);
        dst1 = __lsx_vslli_h(dst1, 6);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        src0_ptr += src_stride_2x;
        src1_ptr += src2_stride_x;
        dst += dst_stride_2x;
    }
}

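/* Bi-prediction copy for 12-pixel wide blocks, four rows per iteration:
 * the left 8 columns are stored as doublewords and the right 4 columns
 * as words taken from the second half of each src1 row. */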
static
void hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    int16_t *_src1 = src1_ptr + 8;
    __m128i out0, out1, out2;
    __m128i src0, src1, src2, src3;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        src0 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(_src1, 0);
        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
                  in5, in6);
        in7 = __lsx_vldx(_src1, src2_stride_3x);
        _src1 += src2_stride_2x;

        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0, dst1, dst2, dst3);
        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5);
        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
        __lsx_vstelm_d(out0, dst, 0, 0);
        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
        __lsx_vstelm_w(out2, dst, 8, 0);
        __lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
        __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
        __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
        dst += dst_stride_4x;
    }
}

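/* Bi-prediction copy for 16-pixel wide blocks, four rows per iteration;
 * the low and high bytes of each source row are widened separately and
 * combined with the two halves of the 16-bit intermediate row. */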
static
void hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    uint32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    int16_t *_src1 = src1_ptr + 8;
    __m128i out0, out1, out2, out3;
    __m128i src0, src1, src2, src3;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    __m128i zero = {0};

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src1, src2);
        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
                  src2_stride_2x, in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        in4 = __lsx_vld(_src1, 0);
        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
                  in5, in6);
        in7 = __lsx_vldx(_src1, src2_stride_3x);
        _src1 += src2_stride_2x;
        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
                  dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
                  dst0_l, dst1_l, dst2_l, dst3_l);
        DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
                  dst0_l, dst1_l, dst2_l, dst3_l);

        out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l);
        out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l);
        out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l);
        out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l);
        __lsx_vst(out0, dst, 0);
        __lsx_vstx(out1, dst, dst_stride);
        __lsx_vstx(out2, dst, dst_stride_2x);
        __lsx_vstx(out3, dst, dst_stride_3x);
        dst += dst_stride_4x;
    }
}

static
void hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                        dst + 16, dst_stride, height);
}

static
void hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, height);
}

static
void hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, height);
}

static
void hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    hevc_bi_copy_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    hevc_bi_copy_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
                         dst + 32, dst_stride, height);
}

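/* Horizontal 8-tap filter for 16-pixel wide blocks, two rows per
 * iteration: the 8-bit source is filtered to a 16-bit intermediate,
 * added to src1 with saturation and rounded/clipped to 8 bits. */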
static void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i src0, src1, src2, src3;
    __m128i filt0, filt1, filt2, filt3;
    __m128i mask1, mask2, mask3;
    __m128i vec0, vec1, vec2, vec3;
    __m128i dst0, dst1, dst2, dst3;
    __m128i in0, in1, in2, in3;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);

    src0_ptr -= 3;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
        src1_ptr += src2_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
        src1_ptr += src2_stride;

        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
                  vec2, vec3);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
                  vec0, vec1);
        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
                  vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);

        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride_2x;
    }
}

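/* Horizontal 8-tap filter for 24-pixel wide blocks, one row per
 * iteration: 16 output pixels are stored as a full vector and the
 * remaining 8 as a doubleword. */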
static void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    __m128i src0, src1, tmp0, tmp1;
    __m128i filt0, filt1, filt2, filt3;
    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    __m128i vec0, vec1, vec2, vec3;
    __m128i dst0, dst1, dst2;
    __m128i in0, in1, in2;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);

    src0_ptr -= 3;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
              mask2, mask3, mask4);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
    mask7 = __lsx_vaddi_bu(mask0, 14);

    for (loop_cnt = height; loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
        in2 = __lsx_vld(src1_ptr, 32);
        src1_ptr += src2_stride;

        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
                  src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0,
                  src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
                  dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0,
                  mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
                  dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);

        tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
        dst2 = __lsx_vsadd_h(dst2, in2);
        tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);

        __lsx_vst(tmp0, dst, 0);
        __lsx_vstelm_d(tmp1, dst, 16, 0);
        dst += dst_stride;
    }
}

static void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}

static void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}

static void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
                       dst + 32, dst_stride, filter, height);
}

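/* Vertical 8-tap filter for 8-pixel wide blocks, four rows per
 * iteration, keeping the previous seven source rows as interleaved
 * pairs that slide down the loop. */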
static av_always_inline
void hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
                       int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
                       const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src2_stride_x = (src2_stride << 1);
    int32_t src2_stride_2x = (src2_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
    __m128i src0, src1, src2, src3, src4, src5;
    __m128i src6, src7, src8, src9, src10;
    __m128i in0, in1, in2, in3;
    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
    __m128i filt0, filt1, filt2, filt3;

    src0_ptr -= src_stride_3x;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    src0 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src1, src2);
    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
    src0_ptr += src_stride_4x;
    src4 = __lsx_vld(src0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
              src5, src6);
    src0_ptr += src_stride_3x;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              src10_r, src32_r, src54_r, src21_r);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src7 = __lsx_vld(src0_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
                  src8, src9);
        src10 = __lsx_vldx(src0_ptr, src_stride_3x);
        src0_ptr += src_stride_4x;
        in0 = __lsx_vld(src1_ptr, 0);
        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
                  in1, in2);
        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
        src1_ptr += src2_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src76_r, src87_r, src98_r, src109_r);

        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
                  filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
                  filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
                  dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
                  filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
                  dst0_r, dst1_r, dst2_r, dst3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
                  filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
                  dst0_r, dst1_r, dst2_r, dst3_r);

        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
        __lsx_vstelm_d(dst0_r, dst, 0, 0);
        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}

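/* Vertical 8-tap filter for widths that are a multiple of 16: the outer
 * loop walks 16-column stripes, the inner loop produces two rows per
 * iteration from the low and high halves of the interleaved sources. */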
static av_always_inline
void hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
                                 int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i in0, in1, in2, in3;
    __m128i src10_r, src32_r, src54_r, src76_r;
    __m128i src21_r, src43_r, src65_r, src87_r;
    __m128i dst0_r, dst1_r;
    __m128i src10_l, src32_l, src54_l, src76_l;
    __m128i src21_l, src43_l, src65_l, src87_l;
    __m128i dst0_l, dst1_l;
    __m128i filt0, filt1, filt2, filt3;

    src0_ptr -= src_stride_3x;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        src0 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src1, src2);
        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
        src0_ptr_tmp += src_stride_4x;
        src4 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src5, src6);
        src0_ptr_tmp += src_stride_3x;

        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  src10_r, src32_r, src54_r, src21_r);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  src10_l, src32_l, src54_l, src21_l);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            src7 = __lsx_vld(src0_ptr_tmp, 0);
            src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
            src0_ptr_tmp += src_stride_2x;
            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
            src1_ptr_tmp += src2_stride;
            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
            src1_ptr_tmp += src2_stride;

            DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
            DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);

            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
                      filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
                      src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
                      filt1, dst0_r, dst1_r, dst0_l, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
                      src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
                      filt2, dst0_r, dst1_r, dst0_l, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
                      src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
                      filt3, dst0_r, dst1_r, dst0_l, dst1_l);
            dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
            dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);

            __lsx_vst(dst0_r, dst_tmp, 0);
            __lsx_vstx(dst1_r, dst_tmp, dst_stride);
            dst_tmp += dst_stride_2x;

            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}

static void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 16);
}

static void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 16);
    hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                      dst + 16, dst_stride, filter, height);
}

static void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 32);
}

static void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 48);
}

static void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 64);
}

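/* Separable 8-tap horizontal + vertical (hv) filter for widths that are
 * a multiple of 8, one output row per inner iteration: the horizontal
 * pass yields 16-bit intermediates, the vertical pass accumulates in
 * 32 bits and shifts right by 6, then src1 is added with saturation,
 * negative values are clamped to zero and the result is rounded and
 * shifted by 7 down to unsigned 8 bits. */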
static av_always_inline
void hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
                                int16_t *src1_ptr, int32_t src2_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter_x, const int8_t *filter_y,
                                int32_t height, int32_t width)
{
    uint32_t loop_cnt;
    uint32_t cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    int32_t src_stride_2x = (src_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i out;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i in0, tmp;
    __m128i filt0, filt1, filt2, filt3;
    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
    __m128i mask1, mask2, mask3;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst0_r, dst0_l;
    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
    __m128i dst10_l, dst32_l, dst54_l, dst76_l;

    src0_ptr -= src_stride_3x + 3;

    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
              6, filt0, filt1, filt2, filt3);
    filt_h3 = __lsx_vld(filter_y, 0);
    filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);

    DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
              filt_h0, filt_h1, filt_h2, filt_h3);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        src0 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src1, src2);
        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
        src0_ptr_tmp += src_stride_4x;
        src4 = __lsx_vld(src0_ptr_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
                  src_stride_2x, src5, src6);
        src0_ptr_tmp += src_stride_3x;

        /* row 0 row 1 row 2 row 3 */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
                  vec12, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);

        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = __lsx_vld(src0_ptr_tmp, 0);
            src0_ptr_tmp += src_stride;

            in0 = __lsx_vld(src1_ptr_tmp, 0);
            src1_ptr_tmp += src2_stride;

            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
                      filt2, dst7, dst7);
            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
            DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
                      dst6, dst10_r, dst32_r, dst54_r, dst76_r);
            DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
                      dst6, dst10_l, dst32_l, dst54_l, dst76_l);

            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
                      dst0_r, dst0_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
                      dst76_l, filt_h3, dst0_r, dst0_l);
            dst0_r = __lsx_vsrli_w(dst0_r, 6);
            dst0_l = __lsx_vsrli_w(dst0_l, 6);

            tmp = __lsx_vpickev_h(dst0_l, dst0_r);
            tmp = __lsx_vsadd_h(tmp, in0);
            tmp = __lsx_vmaxi_h(tmp, 0);
            out = __lsx_vssrlrni_bu_h(tmp, tmp, 7);
            __lsx_vstelm_d(out, dst_tmp, 0, 0);
            dst_tmp += dst_stride;

            dst0 = dst1;
            dst1 = dst2;
            dst2 = dst3;
            dst3 = dst4;
            dst4 = dst5;
            dst5 = dst6;
            dst6 = dst7;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}

static void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                              int16_t *src1_ptr, int32_t src2_stride,
                              uint8_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 8);
}

static void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 16);
}

static void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 24);
}

static void hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 32);
}

static void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 48);
}

static void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, height, 64);
}

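/* Horizontal 4-tap filter for 24-pixel wide blocks, four rows per
 * iteration: the left 16 columns and the right 8 columns are filtered
 * and stored separately. */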
1057static void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1058                               int16_t *src1_ptr, int32_t src2_stride,
1059                               uint8_t *dst, int32_t dst_stride,
1060                               const int8_t *filter, int32_t height)
1061{
1062    int16_t *src1_ptr_tmp;
1063    uint8_t *dst_tmp;
1064    uint32_t loop_cnt;
1065    int32_t dst_stride_2x = (dst_stride << 1);
1066    int32_t dst_stride_4x = (dst_stride << 2);
1067    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1068    int32_t src2_stride_x = src2_stride << 1;
1069    int32_t src2_stride_2x = src2_stride << 2;
1070    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1071
1072    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1073    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1074    __m128i filt0, filt1;
1075    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1076    __m128i mask1, mask2, mask3;
1077    __m128i vec0, vec1, vec2, vec3;
1078    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1079
1080    src0_ptr -= 1;
1081    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1082
1083    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
1084    mask3 = __lsx_vaddi_bu(mask0, 10);
1085
1086    dst_tmp = dst + 16;
1087    src1_ptr_tmp = src1_ptr + 16;
1088
1089    for (loop_cnt = (height >> 2); loop_cnt--;) {
1090        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
1091        src0_ptr += src_stride;
1092        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
1093        src0_ptr += src_stride;
1094        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
1095        src0_ptr += src_stride;
1096        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
1097        src0_ptr += src_stride;
1098
1099        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
1100        src1_ptr += src2_stride;
1101        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
1102        src1_ptr += src2_stride;
1103        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
1104        src1_ptr += src2_stride;
1105        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
1106        src1_ptr += src2_stride;
1107
1108        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2,
1109                  src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
1110        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1111                  vec3, filt0, dst0, dst1, dst2, dst3);
1112        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2,
1113                  src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
1114        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1115                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1116
1117        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
1118                  src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
1119        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1120                  vec3, filt0, dst4, dst5, dst6, dst7);
1121        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
1122                  src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
1123        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
1124                  dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);
1125
1126        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
1127        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
1128        dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
1129        dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
1130        __lsx_vst(dst0, dst, 0);
1131        __lsx_vstx(dst1, dst, dst_stride);
1132        __lsx_vstx(dst2, dst, dst_stride_2x);
1133        __lsx_vstx(dst3, dst, dst_stride_3x);
1134        dst += dst_stride_4x;
1135
1136        in0 = __lsx_vld(src1_ptr_tmp, 0);
1137        DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
1138                  src2_stride_2x, in1, in2);
1139        in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
1140        src1_ptr_tmp += src2_stride_2x;
1141
1142        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5,
1143                  src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
1144        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1145                  vec3, filt0, dst0, dst1, dst2, dst3);
1146        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5,
1147                  src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
1148        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1149                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1150        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
1151        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
1152        __lsx_vstelm_d(dst0, dst_tmp, 0, 0);
1153        __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
1154        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
1155        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
1156        dst_tmp += dst_stride_4x;
1157    }
1158}
1159
1160static void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1161                               int16_t *src1_ptr, int32_t src2_stride,
1162                               uint8_t *dst, int32_t dst_stride,
1163                               const int8_t *filter, int32_t height)
1164{
1165    uint32_t loop_cnt;
1166    __m128i src0, src1, src2;
1167    __m128i in0, in1, in2, in3;
1168    __m128i filt0, filt1;
1169    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1170    __m128i mask1, mask2, mask3;
1171    __m128i dst0, dst1, dst2, dst3;
1172    __m128i vec0, vec1, vec2, vec3;
1173
1174    src0_ptr -= 1;
1175
1176    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1177
1178    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
1179    mask3 = __lsx_vaddi_bu(mask0, 10);
1180
1181    for (loop_cnt = height; loop_cnt--;) {
1182        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
1183        src2 = __lsx_vld(src0_ptr, 24);
1184        src0_ptr += src_stride;
1185        DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
1186                  src1_ptr, 48, in0, in1, in2, in3);
1187        src1_ptr += src2_stride;
1188        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1,
1189                  src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
1190        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1191                  vec3, filt0, dst0, dst1, dst2, dst3);
1192        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1,
1193                  src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
1194        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1195                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1196        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
1197        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
1198        __lsx_vst(dst0, dst, 0);
1199        __lsx_vst(dst1, dst, 16);
1200        dst += dst_stride;
1201    }
1202}
1203
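/* Vertical 4-tap bi-prediction filter, 12 pixels wide: four rows per
 * iteration; the left 8 columns use the low-half interleaves, the right
 * 4 columns are packed into src2110/src4332/src6554. */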
1204static void hevc_vt_4t_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1205                               int16_t *src1_ptr, int32_t src2_stride,
1206                               uint8_t *dst, int32_t dst_stride,
1207                               const int8_t *filter, int32_t height)
1208{
1209    int32_t loop_cnt;
1210    int32_t src_stride_2x = (src_stride << 1);
1211    int32_t dst_stride_2x = (dst_stride << 1);
1212    int32_t dst_stride_4x = (dst_stride << 2);
1213    int32_t src_stride_4x = (src_stride << 2);
1214    int32_t src2_stride_x = (src2_stride << 1);
1215    int32_t src2_stride_2x = (src2_stride << 2);
1216    int32_t src_stride_3x = src_stride_2x + src_stride;
1217    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1218    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1219    int16_t *_src1 = src1_ptr + 8;
1220    __m128i src0, src1, src2, src3, src4, src5, src6;
1221    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1222    __m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
1223    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1224    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
1225    __m128i src2110, src4332, src6554;
1226    __m128i dst0_l, dst1_l, filt0, filt1;
1227
1228    src0_ptr -= src_stride;
1229    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1230
1231    src0 = __lsx_vld(src0_ptr, 0);
1232    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1233              src1, src2);
1234    src0_ptr += src_stride_3x;
1235    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1236    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1237    src2110 = __lsx_vilvl_d(src21_l, src10_l);
1238
1239    for (loop_cnt = (height >> 2); loop_cnt--;) {
1240        src3 = __lsx_vld(src0_ptr, 0);
1241        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1242                  src4, src5);
1243        src6 = __lsx_vldx(src0_ptr, src_stride_3x);
1244        src0_ptr += src_stride_4x;
1245        in0 = __lsx_vld(src1_ptr, 0);
1246        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
1247                  src2_stride_2x, in1, in2);
1248        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
1249        src1_ptr += src2_stride_2x;
1250        in4 = __lsx_vld(_src1, 0);
1251        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
1252                  in5, in6);
1253        in7 = __lsx_vldx(_src1, src2_stride_3x);
1254        _src1 += src2_stride_2x;
1255        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
1256
1257        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
1258        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
1259        src4332 = __lsx_vilvl_d(src43_l, src32_l);
1260        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r);
1261        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l);
1262        src6554 = __lsx_vilvl_d(src65_l, src54_l);
1263
1264        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110,
1265                  filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r);
1266        DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0,
1267                  dst3_r, dst1_l);
1268        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
1269                  src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r,
1270                  filt1, dst0_r, dst1_r, dst0_l, dst2_r);
1271        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l,
1272                  src6554, filt1, dst3_r, dst1_l);
1273        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
1274        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
1275        dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l);
1276        __lsx_vstelm_d(dst0_r, dst, 0, 0);
1277        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
1278        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
1279        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
1280        __lsx_vstelm_w(dst0_l, dst, 8, 0);
1281        __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1);
1282        __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2);
1283        __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3);
1284        dst += dst_stride_4x;
1285
1286        src2 = src6;
1287        src10_r = src54_r;
1288        src21_r = src65_r;
1289        src2110 = src6554;
1290    }
1291}
1292
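/* Vertical 4-tap bi-prediction filter, 16 pixels wide: four rows per
 * iteration, handled as two 2-row halves that reuse each other's
 * interleaved source pairs as vertical context. */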
1293static void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1294                               int16_t *src1_ptr, int32_t src2_stride,
1295                               uint8_t *dst, int32_t dst_stride,
1296                               const int8_t *filter, int32_t height)
1297{
1298    int32_t loop_cnt;
1299    const int32_t src_stride_2x = (src_stride << 1);
1300    const int32_t dst_stride_2x = (dst_stride << 1);
1301    const int32_t src_stride_3x = src_stride_2x + src_stride;
1302    __m128i src0, src1, src2, src3, src4, src5;
1303    __m128i in0, in1, in2, in3;
1304    __m128i src10_r, src32_r, src21_r, src43_r;
1305    __m128i src10_l, src32_l, src21_l, src43_l;
1306    __m128i dst0_r, dst1_r, dst0_l, dst1_l;
1307    __m128i filt0, filt1;
1308
1309    src0_ptr -= src_stride;
1310    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1311
1312    src0 = __lsx_vld(src0_ptr, 0);
1313    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1314              src1, src2);
1315    src0_ptr += src_stride_3x;
1316    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1317    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1318
1319    for (loop_cnt = (height >> 2); loop_cnt--;) {
1320        src3 = __lsx_vld(src0_ptr, 0);
1321        src4 = __lsx_vldx(src0_ptr, src_stride);
1322        src0_ptr += src_stride_2x;
1323        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
1324        src1_ptr += src2_stride;
1325        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
1326        src1_ptr += src2_stride;
1327        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
1328        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
1329
1330        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
1331                  filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
1332        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
1333                  filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1,
1334                  dst0_r, dst1_r, dst0_l, dst1_l);
1335
1336        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1337        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1338        __lsx_vst(dst0_r, dst, 0);
1339        __lsx_vstx(dst1_r, dst, dst_stride);
1340        dst += dst_stride_2x;
1341
1342        src5 = __lsx_vld(src0_ptr, 0);
1343        src2 = __lsx_vldx(src0_ptr, src_stride);
1344        src0_ptr += src_stride_2x;
1345        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
1346        src1_ptr += src2_stride;
1347        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
1348        src1_ptr += src2_stride;
1349        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
1350        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
1351
1352        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
1353                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
1354        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
1355                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
1356                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
1357        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1358        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1359        __lsx_vst(dst0_r, dst, 0);
1360        __lsx_vstx(dst1_r, dst, dst_stride);
1361        dst += dst_stride_2x;
1362    }
1363}
1364
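/* Vertical 4-tap bi-prediction filter, 24 pixels wide: the left 16 columns
 * use full-width interleaves, the right 8 columns only the low halves;
 * four rows per iteration. */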
1365static void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1366                               int16_t *src1_ptr, int32_t src2_stride,
1367                               uint8_t *dst, int32_t dst_stride,
1368                               const int8_t *filter, int32_t height)
1369{
1370    uint32_t loop_cnt;
1371    int32_t dst_stride_2x = dst_stride << 1;
1372    __m128i src0, src1, src2, src3, src4, src5;
1373    __m128i src6, src7, src8, src9, src10, src11;
1374    __m128i in0, in1, in2, in3, in4, in5;
1375    __m128i src10_r, src32_r, src76_r, src98_r;
1376    __m128i src21_r, src43_r, src87_r, src109_r;
1377    __m128i src10_l, src32_l, src21_l, src43_l;
1378    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1379    __m128i dst0_l, dst1_l;
1380    __m128i filt0, filt1;
1381
1382    src0_ptr -= src_stride;
1383    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1384
1385    /* 16width */
1386    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6);
1387    src0_ptr += src_stride;
1388    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7);
1389    src0_ptr += src_stride;
1390    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
1391    src0_ptr += src_stride;
1392    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1393    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1394    /* 8width */
1395    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
1396
1397    for (loop_cnt = (height >> 2); loop_cnt--;) {
1398        /* 16width */
1399        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9);
1400        src0_ptr += src_stride;
1401        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10);
1402        src0_ptr += src_stride;
1403        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
1404        in4 = __lsx_vld(src1_ptr, 32);
1405        src1_ptr += src2_stride;
1406        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
1407        in5 = __lsx_vld(src1_ptr, 32);
1408        src1_ptr += src2_stride;
1409        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
1410        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
1411        /* 8width */
1412        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
1413        /* 16width */
1414        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
1415                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
1416        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1,  dst0_l,
1417                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1,
1418                  dst0_r, dst0_l, dst1_r, dst1_l);
1419        /* 8width */
1420        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
1421                  dst2_r, dst3_r);
1422        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
1423                  src109_r, filt1, dst2_r, dst3_r);
1424        /* 16width */
1425        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1426        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1427        dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
1428        __lsx_vst(dst0_r, dst, 0);
1429        __lsx_vstx(dst1_r, dst, dst_stride);
1430        __lsx_vstelm_d(dst2_r, dst, 16, 0);
1431        __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
1432        dst += dst_stride_2x;
1433
1434        /* 16width */
1435        DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr,
1436                  32, src5, in0, in2, in4);
1437        src1_ptr += src2_stride;
1438        DUP4_ARG2(__lsx_vld, src0_ptr, 16,  src1_ptr, 0, src1_ptr, 16, src1_ptr,
1439                  32, src11, in1, in3, in5);
1440        src1_ptr += src2_stride;
1441        src0_ptr += src_stride;
1442        DUP2_ARG2(__lsx_vld, src0_ptr, 0,  src0_ptr, 16, src2, src8);
1443        src0_ptr += src_stride;
1444        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
1445        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
1446        /* 8width */
1447        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
1448        /* 16width */
1449        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
1450                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
1451        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
1452                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
1453                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
1454
1455        /* 8width */
1456        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
1457                  dst2_r, dst3_r);
1458        DUP2_ARG3(__lsx_vdp2add_h_bu_b,  dst2_r, src76_r, filt1, dst3_r,
1459                  src87_r, filt1, dst2_r, dst3_r);
1460
1461        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1462        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1463        dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
1464        __lsx_vst(dst0_r, dst, 0);
1465        __lsx_vstx(dst1_r, dst, dst_stride);
1466        __lsx_vstelm_d(dst2_r, dst, 16, 0);
1467        __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
1468        dst += dst_stride_2x;
1469    }
1470}
1471
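/* 32 pixels wide: two independent 16-wide vertical passes. */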
1472static void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1473                               int16_t *src1_ptr, int32_t src2_stride,
1474                               uint8_t *dst, int32_t dst_stride,
1475                               const int8_t *filter, int32_t height)
1476{
1477    hevc_vt_4t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
1478                       dst, dst_stride, filter, height);
1479    hevc_vt_4t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1480                       dst + 16, dst_stride, filter, height);
1481}
1482
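/* Horizontal + vertical 4-tap bi-prediction filter, 6 pixels wide: eight
 * output rows are produced in a single pass, columns 0..3 stored as words
 * and columns 4..5 as halfwords. */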
1483static void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1484                              int16_t *src1_ptr, int32_t src2_stride,
1485                              uint8_t *dst, int32_t dst_stride,
1486                              const int8_t *filter_x, const int8_t *filter_y,
1487                              int32_t height)
1488{
1489    int32_t src_stride_2x = (src_stride << 1);
1490    int32_t dst_stride_2x = (dst_stride << 1);
1491    int32_t src_stride_4x = (src_stride << 2);
1492    int32_t dst_stride_4x = (dst_stride << 2);
1493    int32_t src2_stride_2x = (src2_stride << 1);
1494    int32_t src2_stride_4x = (src2_stride << 2);
1495    int32_t src_stride_3x = src_stride_2x + src_stride;
1496    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1497    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
1498    __m128i out0, out1;
1499    __m128i src0, src1, src2, src3, src4, src5, src6;
1500    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1;
1501    __m128i filt0, filt1, filt_h0, filt_h1;
1502    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5;
1503    __m128i dsth6, dsth7, dsth8, dsth9, dsth10;
1504    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1505    __m128i dst4_r, dst5_r, dst6_r, dst7_r;
1506    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
1507    __m128i reg0, reg1, reg2, reg3;
1508    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1509
1510    src0_ptr -= (src_stride + 1);
1511    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1512
1513    filt_h1 = __lsx_vld(filter_y, 0);
1514    filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0);
1515    DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1);
1516
1517    mask1 = __lsx_vaddi_bu(mask0, 2);
1518
1519    src0 = __lsx_vld(src0_ptr, 0);
1520    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1521              src1, src2);
1522    src0_ptr += src_stride_3x;
1523
1524    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1525    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1526    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1527
1528    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
1529    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1530    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
1531              dsth0, dsth1);
1532    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
1533
1534    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2);
1535    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3);
1536
1537    src3 = __lsx_vld(src0_ptr, 0);
1538    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1539              src4, src5);
1540    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
1541    src0_ptr += src_stride_4x;
1542    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
1543    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
1544    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
1545    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
1546
1547    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1548              filt0, dsth3, dsth4, dsth5, dsth6);
1549    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1, dsth5,
1550              vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5, dsth6);
1551
1552    src3 = __lsx_vld(src0_ptr, 0);
1553    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1554              src4, src5);
1555    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
1556
1557    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
1558    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
1559    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
1560    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
1561
1562    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1563              filt0, dsth7, dsth8, dsth9, dsth10);
1564    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1, dsth9,
1565              vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9, dsth10);
1566
1567    DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6);
1568    DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7);
1569    DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2);
1570    DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3);
1571    DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0,
1572              tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
1573    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6,
1574              filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1,
1575              dst0_r, dst1_r, dst2_r, dst3_r);
1576    DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8);
1577    dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0);
1578    dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1);
1579
1580    DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2);
1581    DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3);
1582    DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6);
1583    DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7);
1584    DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0,
1585              tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r);
1586    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2,
1587              filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1,
1588              dst4_r, dst5_r, dst6_r, dst7_r);
1589    DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1);
1590    tmp2 = __lsx_vpickev_d(tmp7, tmp5);
1591
1592    DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l);
1593    dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0);
1594    DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1,
1595              dst1_l, dst2_l);
1596    dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1);
1597
1598    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
1599              dst0_r, dst1_r, dst2_r, dst3_r);
1600    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6,
1601              dst4_r, dst5_r, dst6_r, dst7_r);
1602    DUP4_ARG2(__lsx_vsrai_w, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
1603              dst0_l, dst1_l, dst2_l, dst3_l);
1604    DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
1605    DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
1606    DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
1607
1608    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
1609    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
1610    dsth0 = __lsx_vilvl_d(reg1, reg0);
1611    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
1612    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
1613    dsth1 = __lsx_vilvl_d(reg1, reg0);
1614    src1_ptr += src2_stride_4x;
1615    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
1616    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
1617    dsth2 = __lsx_vilvl_d(reg1, reg0);
1618    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
1619    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
1620    dsth3 = __lsx_vilvl_d(reg1, reg0);
1621
1622    DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3,
1623              tmp3, tmp0, tmp1, tmp2, tmp3);
1624    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
1625              tmp0, tmp1, tmp2, tmp3);
1626    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1627
1628    __lsx_vstelm_w(out0, dst, 0, 0);
1629    __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
1630    __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
1631    __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
1632    dst += dst_stride_4x;
1633    __lsx_vstelm_w(out1, dst, 0, 0);
1634    __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
1635    __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
1636    __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
1637    dst -= dst_stride_4x;
1638
1639    src1_ptr -= src2_stride_4x;
1640
1641    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
1642    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
1643    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
1644    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
1645    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
1646    dsth4 = __lsx_vilvl_d(tmp1, tmp0);
1647    src1_ptr += src2_stride_4x;
1648
1649    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
1650    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
1651    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
1652    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
1653    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
1654    dsth5 = __lsx_vilvl_d(tmp1, tmp0);
1655    DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5);
1656    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
1657    out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
1658
1659    __lsx_vstelm_h(out0, dst, 4, 0);
1660    __lsx_vstelm_h(out0, dst + dst_stride, 4, 1);
1661    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2);
1662    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3);
1663    dst += dst_stride_4x;
1664    __lsx_vstelm_h(out0, dst, 4, 4);
1665    __lsx_vstelm_h(out0, dst + dst_stride, 4, 5);
1666    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6);
1667    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7);
1668}
1669
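/* Horizontal + vertical 4-tap bi-prediction filter for an 8x2 block. */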
1670static av_always_inline
1671void hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
1672                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
1673                        const int8_t *filter_x, const int8_t *filter_y)
1674{
1675    int32_t src_stride_2x = (src_stride << 1);
1676    int32_t src_stride_4x = (src_stride << 2);
1677    int32_t src_stride_3x = src_stride_2x + src_stride;
1678
1679    __m128i out;
1680    __m128i src0, src1, src2, src3, src4;
1681    __m128i filt0, filt1;
1682    __m128i filt_h0, filt_h1;
1683    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1684    __m128i mask1, filter_vec;
1685    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1686    __m128i dst0, dst1, dst2, dst3, dst4;
1687    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
1688    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1689    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1690    __m128i tmp0, tmp1;
1691    __m128i in0, in1;
1692
1693    src0_ptr -= (src_stride + 1);
1694    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1695
1696    filter_vec = __lsx_vld(filter_y, 0);
1697    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1698    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1699
1700    mask1 = __lsx_vaddi_bu(mask0, 2);
1701
1702    src0 = __lsx_vld(src0_ptr, 0);
1703    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1704              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
1705              src1, src2, src3, src4);
1706
1707    DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);
1708
1709    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1710    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1711    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1712    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
1713    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
1714
1715    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1716              filt0, dst0, dst1, dst2, dst3);
1717    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
1718    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
1719              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
1720    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
1721
1722    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1723    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1724    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
1725    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
1726    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1727              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1728    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1729              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1730              dst0_r, dst0_l, dst1_r, dst1_l);
1731    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
1732              dst0_r, dst0_l, dst1_r, dst1_l);
1733    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
1734    DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
1735    DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
1736    out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
1737    __lsx_vstelm_d(out, dst, 0, 0);
1738    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
1739}
1740
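/* Horizontal + vertical 4-tap bi-prediction filter, width 8 * width8mult,
 * four output rows; each iteration handles one 8-column strip. */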
1741static av_always_inline
1742void hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
1743                            int16_t *src1_ptr, int32_t src2_stride,
1744                            uint8_t *dst, int32_t dst_stride,
1745                            const int8_t *filter_x, const int8_t *filter_y,
1746                            int32_t width8mult)
1747{
1748    uint32_t cnt;
1749    int32_t src_stride_2x = (src_stride << 1);
1750    int32_t dst_stride_2x = (dst_stride << 1);
1751    int32_t src_stride_4x = (src_stride << 2);
1752    int32_t src2_stride_x = (src2_stride << 1);
1753    int32_t src2_stride_2x = (src2_stride << 2);
1754    int32_t src_stride_3x = src_stride_2x + src_stride;
1755    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1756    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1757
1758    __m128i out0, out1;
1759    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
1760    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1761    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
1762    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
1763    __m128i in0, in1, in2, in3;
1764    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1765    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
1766    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
1767
1768    src0_ptr -= (src_stride + 1);
1769    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1770
1771    filter_vec = __lsx_vld(filter_y, 0);
1772    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1773    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1774
1775    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1776    mask1 = __lsx_vaddi_bu(mask0, 2);
1777
1778    for (cnt = width8mult; cnt--;) {
1779        src0 = __lsx_vld(src0_ptr, 0);
1780        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1781                  src1, src2);
1782        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
1783        src0_ptr += src_stride_4x;
1784        src4 = __lsx_vld(src0_ptr, 0);
1785        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1786                  src5, src6);
1787        src0_ptr += (8 - src_stride_4x);
1788
1789        in0 = __lsx_vld(src1_ptr, 0);
1790        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
1791                  src2_stride_2x, in1, in2);
1792        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
1793        src1_ptr += 8;
1794
1795        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
1796                  vec0, vec1);
1797        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
1798                  vec2, vec3);
1799        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
1800                  vec4, vec5);
1801
1802        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1803        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1804        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1805                  dst0, dst1);
1806        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
1807
1808        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1809        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1810
1811        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
1812                  vec0, vec1);
1813        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
1814                  vec2, vec3);
1815        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
1816                  vec4, vec5);
1817        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
1818                  vec6, vec7);
1819
1820        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1821                  vec6, filt0, dst3, dst4, dst5, dst6);
1822        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
1823                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
1824
1825        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
1826        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
1827        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
1828        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
1829
1830        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1831                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1832        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1833                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1834        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1835                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1836                  dst0_r, dst0_l, dst1_r, dst1_l);
1837        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1838                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1839                  dst2_r, dst2_l, dst3_r, dst3_l);
1840
1841        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
1842                  dst0_r, dst0_l, dst1_r, dst1_l);
1843        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
1844                  dst2_r, dst2_l, dst3_r, dst3_l);
1845        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
1846                  dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
1847        DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
1848                  tmp0, tmp1, tmp2, tmp3);
1849        DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
1850                  tmp0, tmp1, tmp2, tmp3);
1851        DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1852        __lsx_vstelm_d(out0, dst, 0, 0);
1853        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1854        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1855        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1856        dst += 8;
1857    }
1858}
1859
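/* Horizontal + vertical 4-tap bi-prediction filter for an 8x6 block,
 * fully unrolled. */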
1860static av_always_inline
1861void hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
1862                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
1863                        const int8_t *filter_x, const int8_t *filter_y)
1864{
1865    int32_t src_stride_2x = (src_stride << 1);
1866    int32_t dst_stride_2x = (dst_stride << 1);
1867    int32_t src_stride_4x = (src_stride << 2);
1868    int32_t dst_stride_4x = (dst_stride << 2);
1869    int32_t src2_stride_x = (src2_stride << 1);
1870    int32_t src2_stride_2x = (src2_stride << 2);
1871    int32_t src_stride_3x = src_stride_2x + src_stride;
1872    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1873    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1874
1875    __m128i out0, out1, out2;
1876    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
1877    __m128i in0, in1, in2, in3, in4, in5;
1878    __m128i filt0, filt1;
1879    __m128i filt_h0, filt_h1;
1880    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1881    __m128i mask1, filter_vec;
1882    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1883    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
1884    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1885    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1886    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1887    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
1888    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
1889    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
1890    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
1891    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
1892
1893    src0_ptr -= (src_stride + 1);
1894    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1895
1896    filter_vec = __lsx_vld(filter_y, 0);
1897    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1898    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1899
1900    mask1 = __lsx_vaddi_bu(mask0, 2);
1901
1902    src0 = __lsx_vld(src0_ptr, 0);
1903    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1904              src1, src2);
1905    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
1906    src0_ptr += src_stride_4x;
1907    src4 = __lsx_vld(src0_ptr, 0);
1908    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1909              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
1910              src5, src6, src7, src8);
1911
1912    in0 = __lsx_vld(src1_ptr, 0);
1913    DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
1914              in1, in2);
1915    in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
1916    src1_ptr += src2_stride_2x;
1917    in4 = __lsx_vld(src1_ptr, 0);
1918    in5 = __lsx_vldx(src1_ptr, src2_stride_x);
1919
1920    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1921    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1922    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1923    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
1924    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
1925    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
1926    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
1927    DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
1928    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
1929
1930    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1931              filt0, dst0, dst1, dst2, dst3);
1932    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
1933    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
1934              vec16, filt0, dst5, dst6, dst7, dst8);
1935    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
1936              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
1937    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
1938    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
1939              dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);
1940
1941    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
1942              dst10_r, dst21_r, dst32_r, dst43_r);
1943    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
1944              dst10_l, dst21_l, dst32_l, dst43_l);
1945    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
1946              dst54_r, dst65_r, dst76_r, dst87_r);
1947    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
1948              dst54_l, dst65_l, dst76_l, dst87_l);
1949
1950    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1951              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1952    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1953              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1954    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
1955              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
1956    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1957              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1958              dst0_r, dst0_l, dst1_r, dst1_l);
1959    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1960              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1961              dst2_r, dst2_l, dst3_r, dst3_l);
1962    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
1963              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
1964              dst4_r, dst4_l, dst5_r, dst5_l);
1965
1966    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
1967              dst0_r, dst0_l, dst1_r, dst1_l);
1968    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
1969              dst2_r, dst2_l, dst3_r, dst3_l);
1970    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
1971              dst4_r, dst4_l, dst5_r, dst5_l);
1972    DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
1973              dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
1974    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
1975    DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
1976              tmp0, tmp1, tmp2, tmp3);
1977    DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
1978    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
1979              tmp0, tmp1, tmp2, tmp3);
1980    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
1981    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1982    out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
1983    __lsx_vstelm_d(out0, dst, 0, 0);
1984    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1985    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1986    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1987    dst += dst_stride_4x;
1988    __lsx_vstelm_d(out2, dst, 0, 0);
1989    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
1990}
1991
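/* Generic horizontal + vertical 4-tap bi-prediction filter for widths that
 * are multiples of 8 and heights that are multiples of 4: the outer loop
 * walks 8-column strips, the inner loop emits four rows and carries the
 * vertical context (dst10_*, dst21_*, dst2) into the next iteration. */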
1992static av_always_inline
1993void hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
1994                                int16_t *src1_ptr, int32_t src2_stride,
1995                                uint8_t *dst, int32_t dst_stride,
1996                                const int8_t *filter_x, const int8_t *filter_y,
1997                                int32_t height, int32_t width)
1998{
1999    uint32_t loop_cnt, cnt;
2000    uint8_t *src0_ptr_tmp;
2001    int16_t *src1_ptr_tmp;
2002    uint8_t *dst_tmp;
2003    const int32_t src_stride_2x = (src_stride << 1);
2004    const int32_t dst_stride_2x = (dst_stride << 1);
2005    const int32_t src_stride_4x = (src_stride << 2);
2006    const int32_t dst_stride_4x = (dst_stride << 2);
2007    const int32_t src2_stride_x = (src2_stride << 1);
2008    const int32_t src2_stride_2x = (src2_stride << 2);
2009    const int32_t src_stride_3x = src_stride_2x + src_stride;
2010    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
2011    const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
2012    __m128i out0, out1;
2013    __m128i src0, src1, src2, src3, src4, src5, src6;
2014    __m128i in0, in1, in2, in3;
2015    __m128i filt0, filt1;
2016    __m128i filt_h0, filt_h1;
2017    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2018    __m128i mask1, filter_vec;
2019    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2020    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
2021    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2022    __m128i tmp0, tmp1, tmp2, tmp3;
2023    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
2024    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
2025    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
2026
2027    src0_ptr -= (src_stride + 1);
2028
2029    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2030
2031    filter_vec = __lsx_vld(filter_y, 0);
2032    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2033
2034    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2035
2036    mask1 = __lsx_vaddi_bu(mask0, 2);
2037
2038    for (cnt = width >> 3; cnt--;) {
2039        src0_ptr_tmp = src0_ptr;
2040        dst_tmp = dst;
2041        src1_ptr_tmp = src1_ptr;
2042
2043        src0 = __lsx_vld(src0_ptr_tmp, 0);
2044        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
2045                  src_stride_2x, src1, src2);
2046        src0_ptr_tmp += src_stride_3x;
2047
2048        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
2049                  vec0, vec1);
2050        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
2051                  vec2, vec3);
2052        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
2053                  vec4, vec5);
2054
2055        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2056        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2057        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2058                  dst0, dst1);
2059        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2060
2061        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2062        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2063
2064        for (loop_cnt = height >> 2; loop_cnt--;) {
2065            src3 = __lsx_vld(src0_ptr_tmp, 0);
2066            DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
2067                      src_stride_2x, src4, src5);
2068            src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
2069            src0_ptr_tmp += src_stride_4x;
2070            in0 = __lsx_vld(src1_ptr_tmp, 0);
2071            DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
2072                      src2_stride_2x, in1, in2);
2073            in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
2074            src1_ptr_tmp += src2_stride_2x;
2075
2076            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
2077                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
2078            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
2079                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
2080
2081            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2082                      vec6, filt0, dst3, dst4, dst5, dst6);
2083            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
2084                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
2085                      dst3, dst4, dst5, dst6);
2086
2087            DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2088            DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2089            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
2090            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
2091
2092            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2093                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2094            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2095                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2096            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
2097                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
2098                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
2099            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
2100                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
2101                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
2102
2103            DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2104                      dst0_r, dst0_l, dst1_r, dst1_l);
2105            DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2106                      dst2_r, dst2_l, dst3_r, dst3_l);
2107            DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
2108                      dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
2109            DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
2110                      tmp0, tmp1, tmp2, tmp3);
2111            DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
2112                      tmp1, tmp2, tmp3);
2113            DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
2114            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
2115            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
2116            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
2117            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
2118            dst_tmp += dst_stride_4x;
2119
2120            dst10_r = dst54_r;
2121            dst10_l = dst54_l;
2122            dst21_r = dst65_r;
2123            dst21_l = dst65_l;
2124            dst2 = dst6;
2125        }
2126
2127        src0_ptr += 8;
2128        dst += 8;
2129        src1_ptr += 8;
2130    }
2131}
2132
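/* 8 pixels wide: dispatch on height to the 8x2, 8x4 and 8x6 kernels,
 * otherwise use the generic multiple-of-4 version. */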
2133static void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
2134                              int16_t *src1_ptr, int32_t src2_stride,
2135                              uint8_t *dst, int32_t dst_stride,
2136                              const int8_t *filter_x, const int8_t *filter_y,
2137                              int32_t height)
2138{
2139    if (2 == height) {
2140        hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2141                           dst, dst_stride, filter_x, filter_y);
2142    } else if (4 == height) {
2143        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2144                               dst, dst_stride, filter_x, filter_y, 1);
2145    } else if (6 == height) {
2146        hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2147                           dst, dst_stride, filter_x, filter_y);
2148    } else {
2149        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2150                                dst, dst_stride, filter_x, filter_y, height, 8);
2151    }
2152}
2153
2154static void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
2155                               int16_t *src1_ptr, int32_t src2_stride,
2156                               uint8_t *dst, int32_t dst_stride,
2157                               const int8_t *filter_x, const int8_t *filter_y,
2158                               int32_t height)
2159{
2160    if (4 == height) {
2161        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2162                               dst, dst_stride, filter_x, filter_y, 2);
2163    } else {
2164        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2165                                dst, dst_stride, filter_x, filter_y, height, 16);
2166    }
2167}
2168
2169static void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
2170                               int16_t *src1_ptr, int32_t src2_stride,
2171                               uint8_t *dst, int32_t dst_stride,
2172                               const int8_t *filter_x, const int8_t *filter_y,
2173                               int32_t height)
2174{
2175    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2176                            dst, dst_stride, filter_x, filter_y, height, 24);
2177}
2178
2179static void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
2180                               int16_t *src1_ptr, int32_t src2_stride,
2181                               uint8_t *dst, int32_t dst_stride,
2182                               const int8_t *filter_x, const int8_t *filter_y,
2183                               int32_t height)
2184{
2185    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2186                            dst, dst_stride, filter_x, filter_y, height, 32);
2187}
2188
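/* Public bi-prediction entry points. The 16-bit reference block passed in
 * src_16bit uses a stride of MAX_PB_SIZE elements. */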
2189#define BI_MC_COPY(WIDTH)                                                 \
2190void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst,          \
2191                                                   ptrdiff_t dst_stride,  \
2192                                                   uint8_t *src,          \
2193                                                   ptrdiff_t src_stride,  \
2194                                                   int16_t *src_16bit,    \
2195                                                   int height,            \
2196                                                   intptr_t mx,           \
2197                                                   intptr_t my,           \
2198                                                   int width)             \
2199{                                                                         \
2200    hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE,  \
2201                                dst, dst_stride, height);                 \
2202}
2203
2204BI_MC_COPY(4);
2205BI_MC_COPY(6);
2206BI_MC_COPY(8);
2207BI_MC_COPY(12);
2208BI_MC_COPY(16);
2209BI_MC_COPY(24);
2210BI_MC_COPY(32);
2211BI_MC_COPY(48);
2212BI_MC_COPY(64);
2213
2214#undef BI_MC_COPY
2215
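/* Uni-directional (h or v) filtered bi-prediction entry points; the filter
 * is taken from the qpel/epel table indexed by the fractional mv component
 * (mx or my). */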
2216#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
2217void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
2218                                                      ptrdiff_t dst_stride,  \
2219                                                      uint8_t *src,          \
2220                                                      ptrdiff_t src_stride,  \
2221                                                      int16_t *src_16bit,    \
2222                                                      int height,            \
2223                                                      intptr_t mx,           \
2224                                                      intptr_t my,           \
2225                                                      int width)             \
2226{                                                                            \
2227    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
2228                                                                             \
2229    hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,        \
2230                                          MAX_PB_SIZE, dst, dst_stride,      \
2231                                          filter, height);                   \
2232}
2233
2234BI_MC(qpel, h, 16, 8, hz, mx);
2235BI_MC(qpel, h, 24, 8, hz, mx);
2236BI_MC(qpel, h, 32, 8, hz, mx);
2237BI_MC(qpel, h, 48, 8, hz, mx);
2238BI_MC(qpel, h, 64, 8, hz, mx);
2239
2240BI_MC(qpel, v, 8, 8, vt, my);
2241BI_MC(qpel, v, 16, 8, vt, my);
2242BI_MC(qpel, v, 24, 8, vt, my);
2243BI_MC(qpel, v, 32, 8, vt, my);
2244BI_MC(qpel, v, 48, 8, vt, my);
2245BI_MC(qpel, v, 64, 8, vt, my);
2246
2247BI_MC(epel, h, 24, 4, hz, mx);
2248BI_MC(epel, h, 32, 4, hz, mx);
2249
2250BI_MC(epel, v, 12, 4, vt, my);
2251BI_MC(epel, v, 16, 4, vt, my);
2252BI_MC(epel, v, 24, 4, vt, my);
2253BI_MC(epel, v, 32, 4, vt, my);
2254
2255#undef BI_MC
2256
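/* 2-D (hv) filtered bi-prediction entry points; horizontal and vertical
 * filters are selected by mx and my respectively. */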
2257#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
2258void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,          \
2259                                                   ptrdiff_t dst_stride,  \
2260                                                   uint8_t *src,          \
2261                                                   ptrdiff_t src_stride,  \
2262                                                   int16_t *src_16bit,    \
2263                                                   int height,            \
2264                                                   intptr_t mx,           \
2265                                                   intptr_t my,           \
2266                                                   int width)             \
2267{                                                                         \
2268    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
2269    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
2270                                                                          \
2271    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,           \
2272                                    MAX_PB_SIZE, dst, dst_stride,         \
2273                                    filter_x, filter_y, height);          \
2274}
2275
2276BI_MC_HV(qpel, 8, 8);
2277BI_MC_HV(qpel, 16, 8);
2278BI_MC_HV(qpel, 24, 8);
2279BI_MC_HV(qpel, 32, 8);
2280BI_MC_HV(qpel, 48, 8);
2281BI_MC_HV(qpel, 64, 8);
2282
2283BI_MC_HV(epel, 8, 4);
2284BI_MC_HV(epel, 6, 4);
2285BI_MC_HV(epel, 16, 4);
2286BI_MC_HV(epel, 24, 4);
2287BI_MC_HV(epel, 32, 4);
2288
2289#undef BI_MC_HV
2290