1/*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Hao Chen <chenhao@loongson.cn>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavcodec/vp9dsp.h"
23#include "libavutil/loongarch/loongson_intrinsics.h"
24#include "vp9dsp_loongarch.h"
25
/* Byte-shuffle index tables for __lsx_vshuf_b.  Each index pair (i, i+1)
 * selects two adjacent source bytes so the 8-tap filter can be evaluated
 * with pairwise byte dot-product instructions (__lsx_vdp2_h_b).
 * Indices >= 16 select bytes from the second shuffle source operand,
 * which is how the 4-width tables pick pixels from a second packed row. */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
34
35
/* Horizontally filter four 4-wide rows (two rows packed per vector pair
 * _src0/_src1 and _src2/_src3) with an 8-tap filter.
 * _mask0.._mask3 select the byte pairs for tap pairs 0..3 and
 * _filter0.._filter3 hold the corresponding two taps replicated as a
 * halfword.  Tap pairs 0+1 and 2+3 are accumulated separately
 * (vdp2 then vdp2add) and combined with a saturating halfword add.
 * Rows 0-1 end up in _out0, rows 2-3 in _out1 (signed halfwords). */
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3,                 \
                                   _mask0, _mask1, _mask2, _mask3,             \
                                   _filter0, _filter1, _filter2, _filter3,     \
                                   _out0, _out1)                               \
{                                                                              \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
    __m128i _reg0, _reg1, _reg2, _reg3;                                        \
                                                                               \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,       \
              _tmp0, _tmp1);                                                   \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,       \
               _tmp2, _tmp3);                                                  \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,         \
              _filter1, _reg0, _reg1);                                         \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,       \
               _tmp4, _tmp5);                                                  \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,       \
               _tmp6, _tmp7);                                                  \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,         \
              _filter3, _reg2, _reg3);                                         \
    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);        \
}
60
/* Horizontally filter four 8-wide rows (_src0.._src3, one row per vector)
 * with an 8-tap filter.  Shuffles are self-referential (row with itself)
 * since all 8 output pixels come from a single row.  As in the 4-wide
 * variant, tap pairs 0+1 and 2+3 accumulate into separate registers and
 * are merged with a saturating halfword add into _out0.._out3 (one row
 * of 8 signed halfwords each). */
#define HORIZ_8TAP_8WID_4VECS_FILT(_src0, _src1, _src2, _src3,                 \
                                   _mask0, _mask1, _mask2, _mask3,             \
                                   _filter0, _filter1, _filter2, _filter3,     \
                                   _out0, _out1, _out2, _out3)                 \
{                                                                              \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;            \
                                                                               \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, _src2,\
              _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,         \
              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);          \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, _src2,\
              _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,         \
              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);          \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, _src2,\
              _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,         \
              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
              _reg1, _reg2, _reg3);                                            \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, _src2,\
              _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,         \
              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
              _reg5, _reg6, _reg7);                                            \
    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3,  \
              _reg7, _out0, _out1, _out2, _out3);                              \
}
90
/* Statement-expression: apply the four replicated tap-pair filters to the
 * four pre-shuffled (or row-interleaved, for the vertical case) byte
 * vectors _reg0.._reg3 and return the halfword sum.  Tap pairs 0+1 and
 * 2+3 are accumulated independently and joined with a saturating add,
 * matching the accumulation order used by the HORIZ_* macros so the
 * saturation behavior is identical everywhere. */
#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3,                        \
                             _filter0, _filter1, _filter2, _filter3)           \
( {                                                                            \
    __m128i _vec0, _vec1;                                                      \
                                                                               \
    _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);                                   \
    _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);                         \
    _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);                                   \
    _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);                         \
    _vec0 = __lsx_vsadd_h(_vec0, _vec1);                                       \
                                                                               \
    _vec0;                                                                     \
} )
104
/* Statement-expression: horizontally filter one 8-wide span built from the
 * _src1:_src0 pair, then round (arithmetic shift right by FILTER_BITS = 7
 * with rounding) and saturate the halfwords to the signed 8-bit range
 * (vsat_h with 7 fractional bits).  Returns the halfword result vector. */
#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3,          \
                        _filt_h0, _filt_h1, _filt_h2, _filt_h3)                \
( {                                                                            \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3;                                        \
    __m128i _out;                                                              \
                                                                               \
    DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,\
              _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);\
    _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \
                               _filt_h2, _filt_h3);                            \
    _out = __lsx_vsrari_h(_out, 7);                                            \
    _out = __lsx_vsat_h(_out, 7);                                              \
                                                                               \
    _out;                                                                      \
} )
120
/* Load four vectors from _src at successive _stride offsets.
 * NOTE: advances the _src pointer by 3 * _stride as a side effect.
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and stays safe inside unbraced if/else bodies (the bare { } form broke
 * there because of the trailing semicolon at the call site). */
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3)               \
do {                                                                      \
    _src0 = __lsx_vld(_src, 0);                                           \
    _src += _stride;                                                      \
    _src1 = __lsx_vld(_src, 0);                                           \
    _src += _stride;                                                      \
    _src2 = __lsx_vld(_src, 0);                                           \
    _src += _stride;                                                      \
    _src3 = __lsx_vld(_src, 0);                                           \
} while (0)
131
132static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
133                                 uint8_t *dst, int32_t dst_stride,
134                                 const int8_t *filter)
135{
136    __m128i src0, src1, src2, src3;
137    __m128i filter0, filter1, filter2, filter3;
138    __m128i mask0, mask1, mask2, mask3;
139    __m128i out, out0, out1;
140
141    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
142    src -= 3;
143    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
144              filter0, filter1, filter2, filter3);
145    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
146    mask3 = __lsx_vaddi_bu(mask0, 6);
147
148    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
149    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
150              src0, src1, src2, src3);
151    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
152                     mask3, filter0, filter1, filter2, filter3, out0, out1);
153    out = __lsx_vssrarni_b_h(out1, out0, 7);
154    out = __lsx_vxori_b(out, 128);
155    __lsx_vstelm_w(out, dst, 0, 0);
156    dst += dst_stride;
157    __lsx_vstelm_w(out, dst, 0, 1);
158    dst += dst_stride;
159    __lsx_vstelm_w(out, dst, 0, 2);
160    dst += dst_stride;
161    __lsx_vstelm_w(out, dst, 0, 3);
162}
163
/* 8-tap horizontal filter for a 4x8 block.
 * Two batches of four rows are filtered; each batch yields two halfword
 * result vectors that are narrowed into one byte vector of four 4-pixel
 * rows, stored word by word. */
static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* Step back to the first of the 8 taps (3 pixels left of center). */
    uint8_t *_src = (uint8_t*)src - 3;

    /* 4-width shuffle masks live at offset 16 of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    /* Bias pixels into the signed range for the signed-byte dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                     mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* Second batch of four rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                     mask3, filter0, filter1, filter2, filter3, out2, out3);
    /* Rounding shift by 7, saturating narrow to bytes, undo the bias. */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_w(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 3);
}
216
217static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
218                                uint8_t *dst, int32_t dst_stride,
219                                const int8_t *filter, int32_t height)
220{
221    if (height == 4) {
222        common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
223    } else if (height == 8) {
224        common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
225    }
226}
227
/* 8-tap horizontal filter for an 8x4 block: four full rows are filtered
 * in one macro invocation and stored doubleword by doubleword. */
static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    /* 8-width shuffle masks are the first row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* Step back to the first of the 8 taps (3 pixels left of center). */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* Bias pixels into the signed range for the signed-byte dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
         mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
    /* Rounding shift by 7, saturating narrow to bytes, undo the bias. */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
}
259
/* 8-tap horizontal filter, width 8, height a multiple of 4.
 * Each loop iteration filters four rows and stores them doubleword by
 * doubleword. */
static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;    /* 4 rows per iteration */
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* Step back to the first of the 8 taps (3 pixels left of center). */
    uint8_t* _src = (uint8_t*)src - 3;

    /* 8-width shuffle masks are the first row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Bias to signed range for the signed-byte dot products. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        /* Rounding shift by 7, saturating narrow, undo the bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
}
301
302static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
303                                uint8_t *dst, int32_t dst_stride,
304                                const int8_t *filter, int32_t height)
305{
306    if (height == 4) {
307        common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
308    } else {
309        common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride,
310                                 filter, height);
311    }
312}
313
/* 8-tap horizontal filter, width 16, height a multiple of 2.
 * Each iteration filters two rows; a row is covered by two overlapping
 * loads (offsets 0 and 8) so the shuffles can reach all 8 taps for each
 * of the 16 output pixels. */
static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1;    /* 2 rows per iteration */
    int32_t stride = src_stride << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    /* 8-width shuffle masks are the first row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* Step back to the first of the 8 taps (3 pixels left of center). */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        const uint8_t* _src = src + src_stride;    /* second row */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
        DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
        /* Bias to signed range for the signed-byte dot products. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        /* Rounding shift by 7, saturating narrow, undo the bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;
        src += stride;
    }
}
349
/* 8-tap horizontal filter, width 32, height a multiple of 2.
 * A row is covered by loads at offsets 0, 16 and 24; the middle vector
 * src1 (row bytes 8..23, straddling the src0/src2 boundary) is built
 * with a vshuf_b using the constant 'shuff' index vector. */
static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1;    /* 2 rows per iteration */
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* Byte indices 0x08..0x0F then 0x10..0x17: selects bytes 8-15 of the
     * low load and bytes 0-7 of the high load. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    /* 8-width shuffle masks are the first row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* Step back to the first of the 8 taps (3 pixels left of center). */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* First row: left 16 pixels in out0, right 16 in out1. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Second row, same scheme. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;

        dst += dst_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        dst += dst_stride;
    }
}
399
/* 8-tap horizontal filter, width 64: one row per iteration, processed as
 * two 32-pixel halves with the same boundary-straddling 'shuff' trick as
 * the 32-wide path. */
static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt = height;    /* 1 row per iteration */
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* Byte indices 0x08..0x0F then 0x10..0x17: selects bytes 8-15 of the
     * low load and bytes 0-7 of the high load. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    /* 8-width shuffle masks are the first row of the table. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* Step back to the first of the 8 taps (3 pixels left of center). */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* Left 32 pixels of the row. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Right 32 pixels of the row. */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}
446
/* 8-tap vertical filter, width 4, height a multiple of 4.
 * Rows are interleaved pairwise (vilvl_b) and packed two pairs per
 * vector (vilvl_d) so each FILT_8TAP_DPADD_S_H call produces two output
 * rows at once.  reg0..reg2 carry the sliding 7-row history between
 * iterations; each iteration loads 4 new rows, emits 4 rows, and
 * rotates the history. */
static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;    /* 4 rows per iteration */
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    /* Start 3 rows above the first output row (8-tap window). */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Prime the history with the first 7 rows (src0..src6). */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Interleave consecutive rows, then pack two row-pairs per vector. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
              tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* Bias to signed range for the signed-byte dot products. */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        /* Two output rows per call (rows are packed in pairs). */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
                                   filter2, filter3);
        /* Rounding shift by 7, saturating narrow, undo the bias. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Rotate the sliding history for the next 4 rows. */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}
509
/* 8-tap vertical filter, width 8, height a multiple of 4.
 * One interleaved row-pair per vector (vilvl_b only, no doubleword
 * packing).  reg0..reg5 carry the row-pair history between iterations;
 * each iteration loads 4 rows, produces 4 output rows, and rotates. */
static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;    /* 4 rows per iteration */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Start 3 rows above the first output row (8-tap window). */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Prime the history with the first 7 rows (src0..src6). */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    /* Bias to signed range for the signed-byte dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    /* Interleave consecutive rows into byte pairs for the dot products. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        /* Even output rows use reg0/reg1/reg2, odd rows reg3/reg4/reg5. */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1,
                                   filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1,
                                   filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1,
                                   filter2, filter3);
        /* Rounding shift by 7, saturating narrow, undo the bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Rotate the sliding history for the next 4 rows. */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}
581
/* 8-tap vertical filter, width 16, height a multiple of 4.
 * Like the 8-wide variant but the low halves (vilvl_b, reg0..reg5) and
 * high halves (vilvh_b, reg6..reg11) of each 16-byte row are filtered
 * separately and re-joined by the narrowing store. */
static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;    /* 4 rows per iteration */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Start 3 rows above the first output row (8-tap window). */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Replicate the four halfword tap pairs of the 8-tap filter. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Prime the history with the first 7 rows (src0..src6). */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Bias to signed range for the signed-byte dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    /* Low-half row pairs -> reg0..reg5, high-half pairs -> reg6..reg11. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* New row pairs: low halves in src0..src3, high in src4/5/7/8. */
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src4, src5, src7, src8);
        /* First two output rows (low + high halves). */
        tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
                                   filter2, filter3);
        /* Rounding shift by 7, saturating narrow, undo the bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;
        /* Second two output rows. */
        tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* Rotate the sliding history for the next 4 rows. */
        reg0 = reg2;
        reg1 = src0;
        reg2 = src2;
        reg3 = reg5;
        reg4 = src1;
        reg5 = src3;
        reg6 = reg8;
        reg7 = src4;
        reg8 = src7;
        reg9 = reg11;
        reg10 = src5;
        reg11 = src8;
        src6 = src10;
    }
}
672
/* Apply an 8-tap vertical FIR filter to a block whose width is a multiple
 * of 16 pixels.
 *
 * The block is processed as (width >> 4) independent 16-byte-wide stripes.
 * For each stripe, seven lead-in rows are loaded (reading starts
 * src_stride3 bytes above `src`, i.e. 3 rows for the 8-tap window), then
 * the inner loop emits four output rows per iteration, so `height` is
 * assumed to be a multiple of 4.
 *
 * Pixels are XOR'ed with 128 to re-bias unsigned bytes into signed range
 * for the signed dot product (FILT_8TAP_DPADD_S_H); the 16-bit results are
 * narrowed with a rounding right shift by 7 (__lsx_vssrarni_b_h) and then
 * XOR'ed back to unsigned range before storing.
 *
 * @param src        top-left pixel of the block to filter
 * @param src_stride source row stride in bytes
 * @param dst        output block
 * @param dst_stride destination row stride in bytes
 * @param filter     8-tap filter stored as four packed 16-bit tap pairs
 * @param height     number of output rows (multiple of 4)
 * @param width      block width in pixels (multiple of 16)
 */
static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t cnt = width >> 4;   /* number of 16-pixel-wide stripes */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    /* Start reading three rows above the block for the 8-tap support. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Replicate the four 16-bit tap pairs across vector registers. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {
        uint32_t loop_cnt = height >> 2;

        src_tmp = _src;
        dst_tmp = dst;

        /* Load the seven lead-in rows src0..src6 of this stripe. */
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;

        /* Bias pixels into signed range for the signed dot product. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        /* Interleave adjacent rows: low byte halves into reg0..reg5 and
         * high byte halves into reg6..reg11; these hold the sliding
         * vertical window state across iterations. */
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            /* Fetch the next four input rows src7..src10. */
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            /* New row interleavings: low halves in src0..src3, high halves
             * in src4, src5, src7, src8 (src7/src8 are reused as temps). */
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* Rows 0 and 1 of this group (low + high halves). */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            /* Round, narrow to bytes, un-bias, and store rows 0 and 1. */
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vst(tmp0, dst_tmp, 0);
            __lsx_vstx(tmp1, dst_tmp, dst_stride);
            /* Rows 2 and 3 of this group. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_tmp, dst_stride2);
            __lsx_vstx(tmp1, dst_tmp, dst_stride3);
            dst_tmp += dst_stride4;

            /* Slide the 8-row window down by four rows. */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* Advance to the next 16-pixel-wide stripe. */
        _src += 16;
        dst  += 16;
    }
}
781
782static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
783                                 uint8_t *dst, int32_t dst_stride,
784                                 const int8_t *filter, int32_t height)
785{
786    common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
787}
788
789static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
790                                 uint8_t *dst, int32_t dst_stride,
791                                 const int8_t *filter, int32_t height)
792{
793    common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride,
794                              filter, height, 64);
795}
796
/* 2-D filter (8-tap horizontal then 8-tap vertical) for a 4-pixel-wide
 * block, producing four output rows per loop iteration (`height` is
 * assumed to be a multiple of 4).
 *
 * Seven lead-in rows are loaded (reading starts src_stride3 rows above
 * and 3 bytes to the left of `src`), horizontally filtered with
 * HORIZ_8TAP_FILT, and the intermediate results are packed into byte
 * pairs (tmp0..tmp2) so the vertical pass can reuse the same signed
 * dot product FILT_8TAP_DPADD_S_H.
 *
 * mask0..mask3 come from the "4 width cases" entry of mc_filt_mask_arr
 * (offset 16) and drive the horizontal byte shuffles; two 4-wide rows
 * are filtered per HORIZ_8TAP_FILT call.
 */
static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* Shuffle control: pairs the high half of one intermediate vector
     * with the low half of the next. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 rows and 3 columns for the two 8-tap windows. */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
              filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Load and sign-bias the seven lead-in rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal pass on the lead-in rows (two rows per call). */
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    /* Realign intermediates and pack them into byte pairs for the
     * vertical dot product; tmp0..tmp2 hold the sliding window state. */
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        /* Load, bias, and horizontally filter four new rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        /* Vertical pass: rows 0/1 of this group. */
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        /* Vertical pass: rows 2/3 of this group. */
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round, narrow, un-bias, and store four 4-byte rows. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Slide the vertical window state down by four rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}
884
/* 2-D filter (8-tap horizontal then 8-tap vertical) for an 8-pixel-wide
 * block, producing four output rows per loop iteration (`height` is
 * assumed to be a multiple of 4).
 *
 * Seven lead-in rows are loaded (reading starts src_stride3 rows above
 * and 3 bytes left of `src`), each horizontally filtered on its own with
 * HORIZ_8TAP_FILT; the results are interleaved into byte pairs
 * (tmp0..tmp2 and tmp4..tmp6) forming the vertical-filter window state.
 *
 * mask0..mask3 come from the "8 width cases" entry of mc_filt_mask_arr
 * (offset 0).
 */
static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 rows and 3 columns for the two 8-tap windows. */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Load and sign-bias the seven lead-in rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal pass on each lead-in row (results kept in src0..src6). */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Pack consecutive filtered rows into byte pairs: tmp0..tmp2 feed the
     * even output rows, tmp4..tmp6 the odd ones. */
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        /* Load, bias, and horizontally filter four new rows; after each
         * horizontal result, compute one vertical output row. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round, narrow, un-bias, and store four 8-byte rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the vertical window state down by four rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}
990
991static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
992                                      uint8_t *dst, int32_t dst_stride,
993                                      const int8_t *filter_horiz,
994                                      const int8_t *filter_vert,
995                                      int32_t height)
996{
997    int32_t multiple8_cnt;
998
999    for (multiple8_cnt = 2; multiple8_cnt--;) {
1000        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
1001                                 filter_vert, height);
1002        src += 8;
1003        dst += 8;
1004    }
1005}
1006
1007static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
1008                                      uint8_t *dst, int32_t dst_stride,
1009                                      const int8_t *filter_horiz,
1010                                      const int8_t *filter_vert,
1011                                      int32_t height)
1012{
1013    int32_t multiple8_cnt;
1014
1015    for (multiple8_cnt = 4; multiple8_cnt--;) {
1016        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
1017                                 filter_vert, height);
1018        src += 8;
1019        dst += 8;
1020    }
1021}
1022
1023static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
1024                                      uint8_t *dst, int32_t dst_stride,
1025                                      const int8_t *filter_horiz,
1026                                      const int8_t *filter_vert,
1027                                      int32_t height)
1028{
1029    int32_t multiple8_cnt;
1030
1031    for (multiple8_cnt = 8; multiple8_cnt--;) {
1032        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
1033                                 filter_vert, height);
1034        src += 8;
1035        dst += 8;
1036    }
1037}
1038
1039static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
1040                            uint8_t *dst, int32_t dst_stride,
1041                            int32_t height)
1042{
1043    int32_t cnt = height >> 2;
1044    __m128i src0, src1, src2, src3;
1045
1046    for (;cnt--;) {
1047        src0 = __lsx_vldrepl_d(src, 0);
1048        src += src_stride;
1049        src1 = __lsx_vldrepl_d(src, 0);
1050        src += src_stride;
1051        src2 = __lsx_vldrepl_d(src, 0);
1052        src += src_stride;
1053        src3 = __lsx_vldrepl_d(src, 0);
1054        src += src_stride;
1055        __lsx_vstelm_d(src0, dst, 0, 0);
1056        dst += dst_stride;
1057        __lsx_vstelm_d(src1, dst, 0, 0);
1058        dst += dst_stride;
1059        __lsx_vstelm_d(src2, dst, 0, 0);
1060        dst += dst_stride;
1061        __lsx_vstelm_d(src3, dst, 0, 0);
1062        dst += dst_stride;
1063    }
1064}
1065
1066static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
1067                             uint8_t *dst, int32_t dst_stride,
1068                             int32_t height)
1069{
1070    int32_t cnt = height >> 2;
1071    __m128i src0, src1, src2, src3;
1072    int32_t src_stride2 = src_stride << 1;
1073    int32_t src_stride3 = src_stride + src_stride2;
1074    int32_t src_stride4 = src_stride2 << 1;
1075    int32_t dst_stride2 = dst_stride << 1;
1076    int32_t dst_stride3 = dst_stride2 + dst_stride;
1077    int32_t dst_stride4 = dst_stride2 << 1;
1078    uint8_t *_src = (uint8_t*)src;
1079
1080    for (;cnt--;) {
1081        src0 = __lsx_vld(_src, 0);
1082        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
1083        src3 = __lsx_vldx(_src, src_stride3);
1084        _src += src_stride4;
1085        __lsx_vst(src0, dst, 0);
1086        __lsx_vstx(src1, dst, dst_stride);
1087        __lsx_vstx(src2, dst, dst_stride2);
1088        __lsx_vstx(src3, dst, dst_stride3);
1089        dst += dst_stride4;
1090    }
1091}
1092
1093static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
1094                             uint8_t *dst, int32_t dst_stride,
1095                             int32_t height)
1096{
1097    int32_t cnt = height >> 2;
1098    uint8_t *src_tmp1 = (uint8_t*)src;
1099    uint8_t *dst_tmp1 = dst;
1100    uint8_t *src_tmp2 = src_tmp1 + 16;
1101    uint8_t *dst_tmp2 = dst_tmp1 + 16;
1102    int32_t src_stride2 = src_stride << 1;
1103    int32_t src_stride3 = src_stride + src_stride2;
1104    int32_t src_stride4 = src_stride2 << 1;
1105    int32_t dst_stride2 = dst_stride << 1;
1106    int32_t dst_stride3 = dst_stride2 + dst_stride;
1107    int32_t dst_stride4 = dst_stride2 << 1;
1108    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1109
1110    for (;cnt--;) {
1111        src0 = __lsx_vld(src_tmp1, 0);
1112        DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
1113                  src1, src2);
1114        src3 = __lsx_vldx(src_tmp1, src_stride3);
1115        src_tmp1 += src_stride4;
1116
1117        src4 = __lsx_vld(src_tmp2, 0);
1118        DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
1119                  src5, src6);
1120        src7 = __lsx_vldx(src_tmp2, src_stride3);
1121        src_tmp2 += src_stride4;
1122
1123        __lsx_vst(src0, dst_tmp1, 0);
1124        __lsx_vstx(src1, dst_tmp1, dst_stride);
1125        __lsx_vstx(src2, dst_tmp1, dst_stride2);
1126        __lsx_vstx(src3, dst_tmp1, dst_stride3);
1127        dst_tmp1 += dst_stride4;
1128        __lsx_vst(src4, dst_tmp2, 0);
1129        __lsx_vstx(src5, dst_tmp2, dst_stride);
1130        __lsx_vstx(src6, dst_tmp2, dst_stride2);
1131        __lsx_vstx(src7, dst_tmp2, dst_stride3);
1132        dst_tmp2 += dst_stride4;
1133    }
1134}
1135
1136static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
1137                             uint8_t *dst, int32_t dst_stride,
1138                             int32_t height)
1139{
1140    int32_t cnt = height >> 2;
1141    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1142    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
1143
1144    for (;cnt--;) {
1145        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1146                  src0, src1, src2, src3);
1147        src += src_stride;
1148        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1149                  src4, src5, src6, src7);
1150        src += src_stride;
1151        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1152                  src8, src9, src10, src11);
1153        src += src_stride;
1154        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1155                  src12, src13, src14, src15);
1156        src += src_stride;
1157        __lsx_vst(src0, dst, 0);
1158        __lsx_vst(src1, dst, 16);
1159        __lsx_vst(src2, dst, 32);
1160        __lsx_vst(src3, dst, 48);
1161        dst += dst_stride;
1162        __lsx_vst(src4, dst, 0);
1163        __lsx_vst(src5, dst, 16);
1164        __lsx_vst(src6, dst, 32);
1165        __lsx_vst(src7, dst, 48);
1166        dst += dst_stride;
1167        __lsx_vst(src8, dst, 0);
1168        __lsx_vst(src9, dst, 16);
1169        __lsx_vst(src10, dst, 32);
1170        __lsx_vst(src11, dst, 48);
1171        dst += dst_stride;
1172        __lsx_vst(src12, dst, 0);
1173        __lsx_vst(src13, dst, 16);
1174        __lsx_vst(src14, dst, 32);
1175        __lsx_vst(src15, dst, 48);
1176        dst += dst_stride;
1177    }
1178}
1179
/* Horizontal 8-tap filter on a 4x4 block, with the result averaged
 * against the existing destination pixels (put + avg):
 * dst = avg(filtered(src), dst).
 *
 * The four 4-pixel destination rows are gathered into one vector so a
 * single rounding average (__lsx_vavgr_bu) handles all 16 pixels.
 * Filtering uses the "4 width cases" masks (mc_filt_mask_arr + 16) and
 * the usual xor-128 bias / rounding-shift-by-7 narrowing.
 */
static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1;
    __m128i dst0, dst1, dst2, dst3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;   /* back up 3 columns for the 8-tap window */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Load four source rows, bias to signed, and filter horizontally. */
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    /* Gather the four 4-byte destination rows into one vector. */
    dst0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst0 = __lsx_vilvl_w(dst1, dst0);
    dst1 = __lsx_vilvl_w(dst3, dst2);
    dst0 = __lsx_vilvl_d(dst1, dst0);
    /* Round/narrow the filtered rows, un-bias, average with dst. */
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    dst0 = __lsx_vavgr_bu(tmp0, dst0);
    /* Scatter the four averaged 4-byte rows back out. */
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
}
1224
/* Horizontal 8-tap filter on a 4x8 block, with the result averaged
 * against the existing destination pixels (put + avg):
 * dst = avg(filtered(src), dst).
 *
 * Same scheme as the 4x4 variant, done for two groups of four rows:
 * the eight 4-pixel destination rows are gathered into dst0/dst1, the
 * eight source rows are filtered in two HORIZ_8TAP_4WID_4VECS_FILT
 * passes, then both halves are rounded, un-biased, averaged, and
 * scattered back one 4-byte row at a time.
 */
static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;   /* back up 3 columns for the 8-tap window */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* First four source rows, biased to signed range. */
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    /* Gather destination rows 0-3 into dst0. */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst0 = __lsx_vilvl_d(tmp1, tmp0);

    /* Gather destination rows 4-7 into dst1. */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst1 = __lsx_vilvl_d(tmp1, tmp0);
    /* Filter rows 0-3 (tmp0/tmp1), then rows 4-7 (tmp2/tmp3). */
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp2, tmp3);
    /* Round/narrow each half, merge, un-bias, average with dst. */
    DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
              tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
    /* Scatter the eight averaged 4-byte rows back out. */
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 3);
}
1296
1297static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
1298                                             int32_t src_stride,
1299                                             uint8_t *dst, int32_t dst_stride,
1300                                             const int8_t *filter,
1301                                             int32_t height)
1302{
1303    if (height == 4) {
1304        common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
1305    } else if (height == 8) {
1306        common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
1307    }
1308}
1309
/* Horizontal 8-tap filter on an 8-pixel-wide block, with the result
 * averaged against the existing destination pixels (put + avg):
 * dst = avg(filtered(src), dst).  Four rows per loop iteration
 * (`height` is assumed to be a multiple of 4).
 *
 * Uses the "8 width cases" masks (mc_filt_mask_arr + 0); four 8-byte
 * destination rows are gathered into two vectors so two rounding
 * averages cover all 32 pixels per iteration.
 */
static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    int32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride2 + src_stride;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns for the 8-tap window. */
    uint8_t *_src = (uint8_t*)src - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* Load four source rows, bias to signed, filter horizontally. */
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3,filter0, filter1, filter2, filter3, tmp0, tmp1, tmp2, tmp3);
        /* Gather four 8-byte destination rows into dst0/dst1. */
        dst0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
        /* Round/narrow, un-bias, and average with dst. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
        /* Scatter the four averaged 8-byte rows back out. */
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
}
1364
/* Horizontal 8-tap filter over a 16-pixel-wide block with destination
 * averaging.  Two rows per iteration (height assumed even); each row is
 * filtered as two 8-wide halves loaded at byte offsets 0 and 8. */
static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height >> 1; /* 2 rows per iteration */
    int32_t dst_stride2 = dst_stride << 1;
    uint8_t *dst_tmp = dst;         /* read cursor for the averaging loads */
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* 8-tap window needs 3 pixels left of center */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    /* Broadcast each 16-bit coefficient pair across all lanes. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* Two 16-byte loads per row (offsets 0 and 8) cover the 16 output
         * pixels plus the 7 extra taps of context. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
        src += src_stride;
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
        src += src_stride;
        dst0 = __lsx_vld(dst_tmp, 0);
        dst1 = __lsx_vldx(dst_tmp, dst_stride);
        dst_tmp += dst_stride2;
        /* Bias pixels to signed range for the signed dot products below. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        /* Gather sliding byte pairs for tap pairs 0/1, 2/3, 4/5 and 6/7. */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
                  mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
                  mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
                  mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
                  mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        /* Dot products: taps 0/1 and 2/3 accumulate into tmp0..tmp3,
         * taps 4/5 and 6/7 into tmp8..tmp11 (moved to tmp4..tmp7). */
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
                  filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, tmp11,
                  filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
                  tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, tmp10,
                  tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        /* Saturating add of the two partial sums per pixel. */
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        /* Round-shift by 7, saturate to bytes, undo the 128 bias, then
         * rounding-average with the existing dst rows and store. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
        DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride2;
    }
}
1422
/* Horizontal 8-tap filter over a 32-pixel-wide block with destination
 * averaging.  One row per iteration; the row is covered by loads at
 * offsets 0, 16 and 24, with the middle vector (offset 8) synthesized
 * via vshuf_b so four 8-wide filter passes cover all 32 pixels. */
static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    uint32_t loop_cnt = height;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    /* Byte indices 8..15 of one operand followed by 0..7 of the other:
     * used to build the vector starting at byte offset 8. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* 8-tap window needs 3 pixels left of center */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
                  filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff); /* bytes src+8..src+23 */
        src += src_stride;
        /* NOTE(review): load bases mix dst_tmp and dst; both hold the same
         * row address here because they advance in lockstep each iteration,
         * but a single cursor would be clearer. */
        DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1);
        dst_tmp += dst_stride;
        /* Bias pixels to signed range for the signed dot products below. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        /* Gather sliding byte pairs for tap pairs 0/1, 2/3, 4/5 and 6/7. */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2,
                  src2, mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2,
                  src2, mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2,
                  src2, mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2,
                  src2, mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        /* Dot products and accumulation, two tap pairs per pass. */
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0,
                  tmp3, filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
                  tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1,
             tmp2, tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
        tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        /* Saturating add of the two partial sums per pixel. */
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        /* Round/saturate to bytes, undo bias, average with dst and store. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        dst += dst_stride;
    }
}
1479
/* Horizontal 8-tap filter over a 64-pixel-wide block with destination
 * averaging.  Each row is processed as two 32-pixel halves (byte offsets
 * 0..31 and 32..63), each handled like the 32-wide path: three loads plus
 * a vshuf_b to synthesize the vector at offset 8 (resp. 40). */
static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3, dst0, dst1;
    /* Byte indices 8..15 of one operand followed by 0..7 of the other:
     * used to build the vector starting 8 bytes into a 32-byte half. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3; /* 8-tap window needs 3 pixels left of center */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
                  filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* First 32 pixels of the row. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff); /* bytes src+8..src+23 */
        DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
        /* Bias to signed range, filter, round/saturate, unbias, average. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Second 32 pixels of the row, same sequence at offset 32. */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff); /* bytes src+40..src+55 */
        DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
             mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}
1532
/* Vertical 8-tap filter over a 4-pixel-wide column with destination
 * averaging.  Prologue loads 7 context rows (starting 3 rows above the
 * first output row) and interleaves them; the loop then produces 4 output
 * rows per iteration, sliding the interleaved-row window forward.
 * height is assumed to be a multiple of 4. */
static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;          /* read cursor for the averaging loads */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3; /* 3 rows of top context */

    /* Broadcast each 16-bit coefficient pair across all lanes. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Load the 7 prologue rows src0..src6. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Interleave consecutive rows bytewise, then pack pairs into the
     * reg0..reg2 sliding window used by the vertical dot products. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* Bias to signed range for the signed dot products. */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        /* Four new source rows for this iteration. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Reuse src0..src3 to hold the four 4-byte dst rows for averaging. */
        src0 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
        src0 = __lsx_vilvl_d(src1, src0); /* all four dst rows in one vector */
        /* Interleave the new rows and extend the sliding window. */
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        /* Vertical 8-tap dot products for rows 0/1 and 2/3. */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0,
                                   filter1, filter2, filter3);
        /* Round/saturate to bytes, undo bias, average with dst, store. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src0);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
        /* Slide the interleaved-row window forward by four rows. */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}
1608
/* Vertical 8-tap filter over an 8-pixel-wide column with destination
 * averaging.  Prologue loads 7 context rows; the loop produces 4 output
 * rows per iteration using two interleaved sliding windows (reg0..reg2
 * and reg3..reg5).  height is assumed to be a multiple of 4. */
static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;          /* read cursor for the averaging loads */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3; /* 3 rows of top context */

    /* Broadcast each 16-bit coefficient pair across all lanes. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Load the 7 prologue rows src0..src6 and bias to signed range. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    /* Interleave consecutive rows into the two sliding windows:
     * reg0..reg2 feed even output rows, reg3..reg5 feed odd ones. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2,
              src1, reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        /* Four new source rows for this iteration. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Reuse src0..src3 to hold the four 8-byte dst rows for averaging. */
        src0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
        /* Bias the new rows and interleave with their predecessors. */
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        /* Vertical 8-tap dot products: out0/out2 are rows 0 and 2,
         * out1/out3 are rows 1 and 3. */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0,
                                   filter1, filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0,
                                   filter1, filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0,
                                   filter1, filter2, filter3);
        /* Round/saturate to bytes, undo bias, average with dst, store. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide both interleaved-row windows forward by four rows. */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}
1692
/* Vertical 8-tap filter with destination averaging for widths that are
 * multiples of 16.  The block is processed in 16-pixel-wide column strips
 * (cnt = width >> 4); within a strip, 4 output rows are produced per
 * iteration.  The low and high 8 bytes of each 16-wide row are filtered
 * with separate interleaved sliding windows (reg0..reg5 for the low half,
 * reg6..reg11 for the high half).  height is assumed a multiple of 4. */
static void common_vt_8t_and_aver_dst_16w_mult_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter,
                                                   int32_t height,
                                                   int32_t width)
{
    uint8_t *src_tmp;
    uint32_t cnt = width >> 4; /* number of 16-wide column strips */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3; /* 3 rows of top context */

    /* Broadcast each 16-bit coefficient pair across all lanes. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {
        uint32_t loop_cnt = height >> 2; /* 4 rows per iteration */
        uint8_t *dst_reg = dst;          /* per-strip dst cursor */

        /* Prologue: load the 7 context rows of this strip and bias them. */
        src_tmp = _src;
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        /* Low-half (vilvl) and high-half (vilvh) interleaved windows. */
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            /* Four new source rows, biased to signed range. */
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            /* Interleave new rows: low halves into src0..src3, high halves
             * into src4, src5, src7, src8 (srcN regs are reused). */
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* Rows 0 and 1: low halves (tmp0/tmp1), high halves (tmp2/tmp3). */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            /* Round/saturate to bytes, undo bias, average with dst, store. */
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vld(dst_reg, 0);
            tmp3 = __lsx_vldx(dst_reg, dst_stride);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vst(tmp0, dst_reg, 0);
            __lsx_vstx(tmp1, dst_reg, dst_stride);
            /* Rows 2 and 3: same sequence one window step later. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vldx(dst_reg, dst_stride2);
            tmp3 = __lsx_vldx(dst_reg, dst_stride3);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_reg, dst_stride2);
            __lsx_vstx(tmp1, dst_reg, dst_stride3);
            dst_reg += dst_stride4;

            /* Slide all four interleaved windows forward by four rows. */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* Advance to the next 16-wide column strip. */
        _src += 16;
        dst  += 16;
    }
}
1807
1808static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
1809                                              int32_t src_stride,
1810                                              uint8_t *dst, int32_t dst_stride,
1811                                              const int8_t *filter,
1812                                              int32_t height)
1813{
1814    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
1815                                           filter, height, 16);
1816}
1817
1818static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
1819                                              int32_t src_stride,
1820                                              uint8_t *dst, int32_t dst_stride,
1821                                              const int8_t *filter,
1822                                              int32_t height)
1823{
1824    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
1825                                           filter, height, 32);
1826}
1827
1828static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
1829                                              int32_t src_stride,
1830                                              uint8_t *dst, int32_t dst_stride,
1831                                              const int8_t *filter,
1832                                              int32_t height)
1833{
1834    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
1835                                           filter, height, 64);
1836}
1837
/* Combined horizontal + vertical 8-tap filter over a 4-pixel-wide block
 * with destination averaging.  Each source row is first filtered
 * horizontally (HORIZ_8TAP_FILT); the intermediate rows are then packed
 * and filtered vertically.  4 output rows per iteration; height is
 * assumed to be a multiple of 4. */
static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2; /* 4 rows per iteration */
    uint8_t *dst_tmp = dst;          /* read cursor for the averaging loads */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* Byte indices 8..15 of one operand followed by 0..7 of the other:
     * shifts the intermediate row history by one position. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns (horizontal taps) and 3 rows (vertical taps). */
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* Offset 16 selects the 4-width shuffle masks of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Load the 7 prologue rows src0..src6 and bias to signed range. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontally filter the prologue rows, two rows per call. */
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    /* Build shifted copies and pack even bytes into the vertical-filter
     * input windows tmp0..tmp2. */
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        /* Four new source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Reuse src2..src5 to gather the four 4-byte dst rows to average. */
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src4 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src5 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
        src2 = __lsx_vilvl_d(src3, src2); /* all four dst rows in one vector */
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on rows 7/8, extend the vertical window, and run
         * the vertical 8-tap filter for output rows 0/1. */
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Same for rows 9/10 producing output rows 2/3. */
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/saturate to bytes, undo bias, average with dst, store. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src2);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Slide the intermediate-row window forward by four rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}
1940
/* 8-tap horizontal + 8-tap vertical interpolation with averaging into dst,
 * for 8-pixel-wide blocks, producing four output rows per loop iteration.
 *
 * The source pointer is rewound by 3 columns and 3 rows so each 8-tap
 * window is centred on its output pixel.  filter_horiz/filter_vert each
 * hold 8 signed 8-bit taps, loaded as four 16-bit tap pairs.
 * height is assumed to be a multiple of 4 (loop runs height >> 2 times).
 */
static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Rewind by (3, 3) to centre the 8-tap filter windows. */
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* mask0..mask3 are shuffle patterns selecting the overlapping byte
     * pairs fed to each horizontal tap pair (8-width case of the table). */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load the first 7 input rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* XOR with 128 flips the sign bit: pixels move to a signed-biased
     * domain so the signed dot-product taps can be applied. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontally filter the 7 prologue rows. */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    /* Pre-pack adjacent row pairs for the vertical pass; these registers
     * are software-pipelined across loop iterations (see epilogue below). */
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        /* Load and horizontally filter 4 new rows (7..10). */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        /* Vertical 8-tap over four packed row pairs per output row. */
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round, shift by 7 and saturate-narrow to bytes, then undo the
         * sign-bit flip to return to the unsigned pixel domain. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        /* Load the 4 destination rows and average the filter output into
         * them (the "and_aver_dst" part). */
        src5 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src7 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src8 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src9 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Epilogue: slide the pipelined row-pair registers forward so the
         * next iteration reuses the 3 most recent rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}
2059
/* HV 8-tap filter + average for 16-pixel-wide blocks: the block is
 * processed as two independent 8-pixel columns by the 8-wide kernel. */
static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2079
/* HV 8-tap filter + average for 32-pixel-wide blocks: four independent
 * 8-pixel columns, each handled by the 8-wide kernel. */
static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2099
/* HV 8-tap filter + average for 64-pixel-wide blocks: eight independent
 * 8-pixel columns, each handled by the 8-wide kernel. */
static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2119
2120static void avg_width8_lsx(const uint8_t *src, int32_t src_stride,
2121                           uint8_t *dst, int32_t dst_stride,
2122                           int32_t height)
2123{
2124    int32_t cnt = height >> 2;
2125    uint8_t *dst_tmp = dst;
2126    __m128i src0, src1, dst0, dst1;
2127    __m128i tmp0, tmp1, tmp2, tmp3;
2128
2129    for (;cnt--;) {
2130        tmp0 = __lsx_vldrepl_d(src, 0);
2131        src += src_stride;
2132        tmp1 = __lsx_vldrepl_d(src, 0);
2133        src += src_stride;
2134        tmp2 = __lsx_vldrepl_d(src, 0);
2135        src += src_stride;
2136        tmp3 = __lsx_vldrepl_d(src, 0);
2137        src += src_stride;
2138        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, src0, src1);
2139        tmp0 = __lsx_vldrepl_d(dst_tmp, 0);
2140        dst_tmp += dst_stride;
2141        tmp1 = __lsx_vldrepl_d(dst_tmp, 0);
2142        dst_tmp += dst_stride;
2143        tmp2 = __lsx_vldrepl_d(dst_tmp, 0);
2144        dst_tmp += dst_stride;
2145        tmp3 = __lsx_vldrepl_d(dst_tmp, 0);
2146        dst_tmp += dst_stride;
2147        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2148        DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
2149        __lsx_vstelm_d(dst0, dst, 0, 0);
2150        dst += dst_stride;
2151        __lsx_vstelm_d(dst0, dst, 0, 1);
2152        dst += dst_stride;
2153        __lsx_vstelm_d(dst1, dst, 0, 0);
2154        dst += dst_stride;
2155        __lsx_vstelm_d(dst1, dst, 0, 1);
2156        dst += dst_stride;
2157    }
2158}
2159
2160static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
2161                            uint8_t *dst, int32_t dst_stride,
2162                            int32_t height)
2163{
2164    int32_t cnt = height >> 2;
2165    __m128i src0, src1, src2, src3;
2166    __m128i dst0, dst1, dst2, dst3;
2167    int32_t src_stride2 = src_stride << 1;
2168    int32_t src_stride3 = src_stride + src_stride2;
2169    int32_t src_stride4 = src_stride2 << 1;
2170    int32_t dst_stride2 = dst_stride << 1;
2171    int32_t dst_stride3 = dst_stride2 + dst_stride;
2172    int32_t dst_stride4 = dst_stride2 << 1;
2173    uint8_t* _src = (uint8_t*)src;
2174
2175    for (;cnt--;) {
2176        src0 = __lsx_vld(_src, 0);
2177        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
2178        src3 = __lsx_vldx(_src, src_stride3);
2179        _src += src_stride4;
2180
2181        dst0 = __lsx_vld(dst, 0);
2182        DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2,
2183                  dst1, dst2);
2184        dst3 = __lsx_vldx(dst, dst_stride3);
2185        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
2186                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
2187        __lsx_vst(dst0, dst, 0);
2188        __lsx_vstx(dst1, dst, dst_stride);
2189        __lsx_vstx(dst2, dst, dst_stride2);
2190        __lsx_vstx(dst3, dst, dst_stride3);
2191        dst += dst_stride4;
2192    }
2193}
2194
/* Rounded byte-average of a 32-pixel-wide src block into dst.
 * Each row is split into two 16-byte halves handled through separate
 * cursors (src_tmp1/dst_tmp1 = left half, src_tmp2/dst_tmp2 = right
 * half); four rows are processed per loop iteration.
 * Even-numbered vectors hold left halves, odd-numbered hold right. */
static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *src_tmp1 = (uint8_t*)src;
    uint8_t *src_tmp2 = src_tmp1 + 16;
    uint8_t *dst_tmp1, *dst_tmp2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;

    dst_tmp1 = dst;
    dst_tmp2 = dst + 16;
    for (;cnt--;) {
        /* Left 16 bytes of rows 0-3. */
        src0 = __lsx_vld(src_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
                  src2, src4);
        src6 = __lsx_vldx(src_tmp1, src_stride3);
        src_tmp1 += src_stride4;

        /* Right 16 bytes of rows 0-3. */
        src1 = __lsx_vld(src_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
                  src3, src5);
        src7 = __lsx_vldx(src_tmp2, src_stride3);
        src_tmp2 += src_stride4;

        /* Matching destination halves of rows 0-3. */
        dst0 = __lsx_vld(dst_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2,
                  dst2, dst4);
        dst6 = __lsx_vldx(dst_tmp1, dst_stride3);
        dst1 = __lsx_vld(dst_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp2, dst_stride, dst_tmp2, dst_stride2,
                  dst3, dst5);
        dst7 = __lsx_vldx(dst_tmp2, dst_stride3);

        /* Rounded unsigned averages, written back half-row by half-row. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        __lsx_vst(dst0, dst_tmp1, 0);
        __lsx_vstx(dst2, dst_tmp1, dst_stride);
        __lsx_vstx(dst4, dst_tmp1, dst_stride2);
        __lsx_vstx(dst6, dst_tmp1, dst_stride3);
        dst_tmp1 += dst_stride4;
        __lsx_vst(dst1, dst_tmp2, 0);
        __lsx_vstx(dst3, dst_tmp2, dst_stride);
        __lsx_vstx(dst5, dst_tmp2, dst_stride2);
        __lsx_vstx(dst7, dst_tmp2, dst_stride3);
        dst_tmp2 += dst_stride4;
    }
}
2252
/* Rounded byte-average of a 64-pixel-wide src block into dst.
 * Each row is four 16-byte vectors; four rows are processed per loop
 * iteration (sixteen src and sixteen dst vectors in flight). */
static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    for (;cnt--;) {
        /* Load 4 source rows, 64 bytes each. */
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src0, src1, src2, src3);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src4, src5, src6, src7);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src8, src9, src10, src11);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src12, src13, src14, src15);
        src += src_stride;
        /* Load the matching 4 destination rows. */
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst0, dst1, dst2, dst3);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst4, dst5, dst6, dst7);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst8, dst9, dst10, dst11);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst12, dst13, dst14, dst15);
        dst_tmp += dst_stride;
        /* Rounded unsigned average of each src/dst vector pair. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10,
                  dst10, src11, dst11, dst8, dst9, dst10, dst11);
        DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14,
                  dst14, src15, dst15, dst12, dst13, dst14, dst15);
        /* Store the 4 averaged rows back. */
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        __lsx_vst(dst2, dst, 32);
        __lsx_vst(dst3, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst4, dst, 0);
        __lsx_vst(dst5, dst, 16);
        __lsx_vst(dst6, dst, 32);
        __lsx_vst(dst7, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst8, dst, 0);
        __lsx_vst(dst9, dst, 16);
        __lsx_vst(dst10, dst, 32);
        __lsx_vst(dst11, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst12, dst, 0);
        __lsx_vst(dst13, dst, 16);
        __lsx_vst(dst14, dst, 32);
        __lsx_vst(dst15, dst, 48);
        dst += dst_stride;
    }
}
2319
/* VP9 8-tap sub-pixel filter coefficient tables, one 8-tap row per
 * fractional position 1..15 (position 0 is the full-pel path, which
 * has no table entry — the entry-point macros below index with mx-1
 * or my-1).  Indexed as [filter family][fractional position - 1]. */
static const int8_t vp9_subpel_filters_lsx[3][15][8] = {
    [FILTER_8TAP_REGULAR] = {
         {0, 1, -5, 126, 8, -3, 1, 0},
         {-1, 3, -10, 122, 18, -6, 2, 0},
         {-1, 4, -13, 118, 27, -9, 3, -1},
         {-1, 4, -16, 112, 37, -11, 4, -1},
         {-1, 5, -18, 105, 48, -14, 4, -1},
         {-1, 5, -19, 97, 58, -16, 5, -1},
         {-1, 6, -19, 88, 68, -18, 5, -1},
         {-1, 6, -19, 78, 78, -19, 6, -1},
         {-1, 5, -18, 68, 88, -19, 6, -1},
         {-1, 5, -16, 58, 97, -19, 5, -1},
         {-1, 4, -14, 48, 105, -18, 5, -1},
         {-1, 4, -11, 37, 112, -16, 4, -1},
         {-1, 3, -9, 27, 118, -13, 4, -1},
         {0, 2, -6, 18, 122, -10, 3, -1},
         {0, 1, -3, 8, 126, -5, 1, 0},
    }, [FILTER_8TAP_SHARP] = {
        {-1, 3, -7, 127, 8, -3, 1, 0},
        {-2, 5, -13, 125, 17, -6, 3, -1},
        {-3, 7, -17, 121, 27, -10, 5, -2},
        {-4, 9, -20, 115, 37, -13, 6, -2},
        {-4, 10, -23, 108, 48, -16, 8, -3},
        {-4, 10, -24, 100, 59, -19, 9, -3},
        {-4, 11, -24, 90, 70, -21, 10, -4},
        {-4, 11, -23, 80, 80, -23, 11, -4},
        {-4, 10, -21, 70, 90, -24, 11, -4},
        {-3, 9, -19, 59, 100, -24, 10, -4},
        {-3, 8, -16, 48, 108, -23, 10, -4},
        {-2, 6, -13, 37, 115, -20, 9, -4},
        {-2, 5, -10, 27, 121, -17, 7, -3},
        {-1, 3, -6, 17, 125, -13, 5, -2},
        {0, 1, -3, 8, 127, -7, 3, -1},
    }, [FILTER_8TAP_SMOOTH] = {
        {-3, -1, 32, 64, 38, 1, -3, 0},
        {-2, -2, 29, 63, 41, 2, -3, 0},
        {-2, -2, 26, 63, 43, 4, -4, 0},
        {-2, -3, 24, 62, 46, 5, -4, 0},
        {-2, -3, 21, 60, 49, 7, -4, 0},
        {-1, -4, 18, 59, 51, 9, -4, 0},
        {-1, -4, 16, 57, 53, 12, -4, -1},
        {-1, -4, 14, 55, 55, 14, -4, -1},
        {-1, -4, 12, 53, 57, 16, -4, -1},
        {0, -4, 9, 51, 59, 18, -4, -1},
        {0, -4, 7, 49, 60, 21, -3, -2},
        {0, -4, 5, 46, 62, 24, -3, -2},
        {0, -4, 4, 43, 63, 26, -2, -2},
        {0, -3, 2, 41, 63, 29, -2, -2},
        {0, -3, 1, 38, 64, 32, -1, -3},
    }
};
2371
/* Instantiate the public put/avg 8-tap entry points for one block width
 * SIZE and one filter family (type / type_idx): horizontal-only (h),
 * vertical-only (v) and combined (hv) variants.  Each wrapper selects
 * the coefficient row from vp9_subpel_filters_lsx using the fractional
 * position mx/my (1..15, hence the -1) and forwards to the matching
 * common_* kernel. */
#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx)                      \
void ff_put_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];             \
                                                                               \
    common_hz_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);     \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];             \
                                                                               \
    common_vt_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);     \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,    \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];            \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];            \
                                                                               \
    common_hv_8ht_8vt_##SIZE##w_lsx(src, srcstride, dst, dststride, hfilter,   \
                                    vfilter, h);                               \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];             \
                                                                               \
    common_hz_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,               \
                                            dststride, filter, h);             \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];             \
                                                                               \
    common_vt_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, dststride,    \
                                            filter, h);                        \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,    \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];            \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];            \
                                                                               \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,          \
                                                 dststride, hfilter,           \
                                                 vfilter, h);                  \
}
2439
/* Instantiate the full-pel ff_copy/ff_avg entry points for one block
 * width SIZE; the mx/my parameters are unused in the full-pel case and
 * exist only to match the common dsp function signature. */
#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE)                          \
void ff_copy##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,        \
                         const uint8_t *src, ptrdiff_t srcstride,  \
                         int h, int mx, int my)                    \
{                                                                  \
                                                                   \
    copy_width##SIZE##_lsx(src, srcstride, dst, dststride, h);     \
}                                                                  \
void ff_avg##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,         \
                        const uint8_t *src, ptrdiff_t srcstride,   \
                        int h, int mx, int my)                     \
{                                                                  \
                                                                   \
    avg_width##SIZE##_lsx(src, srcstride, dst, dststride, h);      \
}
2455
/* Instantiate every block-width variant for the three 8-tap filter
 * families. */
VP9_8TAP_LOONGARCH_LSX_FUNC(64, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, regular, FILTER_8TAP_REGULAR);

VP9_8TAP_LOONGARCH_LSX_FUNC(64, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, sharp, FILTER_8TAP_SHARP);

VP9_8TAP_LOONGARCH_LSX_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, smooth, FILTER_8TAP_SMOOTH);

/* Full-pel copy/avg entry points (widths 4 are handled elsewhere). */
VP9_COPY_LOONGARCH_LSX_FUNC(64);
VP9_COPY_LOONGARCH_LSX_FUNC(32);
VP9_COPY_LOONGARCH_LSX_FUNC(16);
VP9_COPY_LOONGARCH_LSX_FUNC(8);

#undef VP9_8TAP_LOONGARCH_LSX_FUNC
#undef VP9_COPY_LOONGARCH_LSX_FUNC
2481