1/*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Jin Bo <jinbo@loongson.cn>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavcodec/vp9dsp.h"
23#include "libavutil/loongarch/loongson_intrinsics.h"
24#include "libavutil/common.h"
25#include "vp9dsp_loongarch.h"
26
27#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, _in2, \
28                 _in3, _in4, _in5, _in6, _in7)                                  \
29{                                                                               \
30    _in0 = __lsx_vld(_src, 0);                                                  \
31    _in1 = __lsx_vldx(_src, _stride);                                           \
32    _in2 = __lsx_vldx(_src, _stride2);                                          \
33    _in3 = __lsx_vldx(_src, _stride3);                                          \
34    _src += _stride4;                                                           \
35    _in4 = __lsx_vld(_src, 0);                                                  \
36    _in5 = __lsx_vldx(_src, _stride);                                           \
37    _in6 = __lsx_vldx(_src, _stride2);                                          \
38    _in7 = __lsx_vldx(_src, _stride3);                                          \
39}
40
41#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7,        \
42                 _dst, _stride, _stride2, _stride3, _stride4)                   \
43{                                                                               \
44    __lsx_vst(_dst0, _dst, 0);                                                  \
45    __lsx_vstx(_dst1, _dst, _stride);                                           \
46    __lsx_vstx(_dst2, _dst, _stride2);                                          \
47    __lsx_vstx(_dst3, _dst, _stride3);                                          \
48    _dst += _stride4;                                                           \
49    __lsx_vst(_dst4, _dst, 0);                                                  \
50    __lsx_vstx(_dst5, _dst, _stride);                                           \
51    __lsx_vstx(_dst6, _dst, _stride2);                                          \
52    __lsx_vstx(_dst7, _dst, _stride3);                                          \
53}
54
55#define VP9_LPF_FILTER4_4W(p1_src, p0_src, q0_src, q1_src, mask_src, hev_src, \
56                           p1_dst, p0_dst, q0_dst, q1_dst)                    \
57{                                                                             \
58    __m128i p1_tmp, p0_tmp, q0_tmp, q1_tmp, q0_sub_p0, filt, filt1, filt2;    \
59    const __m128i cnst3b = __lsx_vldi(3);                                     \
60    const __m128i cnst4b = __lsx_vldi(4);                                     \
61                                                                              \
62    p1_tmp = __lsx_vxori_b(p1_src, 0x80);                                     \
63    p0_tmp = __lsx_vxori_b(p0_src, 0x80);                                     \
64    q0_tmp = __lsx_vxori_b(q0_src, 0x80);                                     \
65    q1_tmp = __lsx_vxori_b(q1_src, 0x80);                                     \
66                                                                              \
67    filt = __lsx_vssub_b(p1_tmp, q1_tmp);                                     \
68                                                                              \
69    filt = filt & hev_src;                                                    \
70                                                                              \
71    q0_sub_p0 = __lsx_vssub_b(q0_tmp, p0_tmp);                                \
72    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                    \
73    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                    \
74    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                    \
75    filt = filt & mask_src;                                                   \
76                                                                              \
77    filt1 = __lsx_vsadd_b(filt, cnst4b);                                      \
78    filt1 = __lsx_vsrai_b(filt1, 3);                                          \
79                                                                              \
80    filt2 = __lsx_vsadd_b(filt, cnst3b);                                      \
81    filt2 = __lsx_vsrai_b(filt2, 3);                                          \
82                                                                              \
83    q0_tmp = __lsx_vssub_b(q0_tmp, filt1);                                    \
84    q0_dst = __lsx_vxori_b(q0_tmp, 0x80);                                     \
85    p0_tmp = __lsx_vsadd_b(p0_tmp, filt2);                                    \
86    p0_dst = __lsx_vxori_b(p0_tmp, 0x80);                                     \
87                                                                              \
88    filt = __lsx_vsrari_b(filt1, 1);                                          \
89    hev_src = __lsx_vxori_b(hev_src, 0xff);                                   \
90    filt = filt & hev_src;                                                    \
91                                                                              \
92    q1_tmp = __lsx_vssub_b(q1_tmp, filt);                                     \
93    q1_dst = __lsx_vxori_b(q1_tmp, 0x80);                                     \
94    p1_tmp = __lsx_vsadd_b(p1_tmp, filt);                                     \
95    p1_dst = __lsx_vxori_b(p1_tmp, 0x80);                                     \
96}
97
98#define VP9_FLAT4(p3_src, p2_src, p0_src, q0_src, q2_src, q3_src, flat_dst)  \
99{                                                                            \
100    __m128i f_tmp = __lsx_vldi(1);                                           \
101    __m128i p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;              \
102                                                                             \
103    p2_a_sub_p0 = __lsx_vabsd_bu(p2_src, p0_src);                            \
104    q2_a_sub_q0 = __lsx_vabsd_bu(q2_src, q0_src);                            \
105    p3_a_sub_p0 = __lsx_vabsd_bu(p3_src, p0_src);                            \
106    q3_a_sub_q0 = __lsx_vabsd_bu(q3_src, q0_src);                            \
107                                                                             \
108    p2_a_sub_p0 = __lsx_vmax_bu(p2_a_sub_p0, q2_a_sub_q0);                   \
109    flat_dst = __lsx_vmax_bu(p2_a_sub_p0, flat_dst);                         \
110    p3_a_sub_p0 = __lsx_vmax_bu(p3_a_sub_p0, q3_a_sub_q0);                   \
111    flat_dst = __lsx_vmax_bu(p3_a_sub_p0, flat_dst);                         \
112                                                                             \
113    flat_dst = __lsx_vslt_bu(f_tmp, flat_dst);                               \
114    flat_dst = __lsx_vxori_b(flat_dst, 0xff);                                \
115    flat_dst = flat_dst & mask;                                              \
116}
117
118#define VP9_FLAT5(p7_src, p6_src, p5_src, p4_src, p0_src, q0_src, q4_src, \
119                  q5_src, q6_src, q7_src, flat_src, flat2_dst)            \
120{                                                                         \
121    __m128i f_tmp = __lsx_vldi(1);                                        \
122    __m128i p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;           \
123    __m128i p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;           \
124                                                                          \
125    p4_a_sub_p0 = __lsx_vabsd_bu(p4_src, p0_src);                         \
126    q4_a_sub_q0 = __lsx_vabsd_bu(q4_src, q0_src);                         \
127    p5_a_sub_p0 = __lsx_vabsd_bu(p5_src, p0_src);                         \
128    q5_a_sub_q0 = __lsx_vabsd_bu(q5_src, q0_src);                         \
129    p6_a_sub_p0 = __lsx_vabsd_bu(p6_src, p0_src);                         \
130    q6_a_sub_q0 = __lsx_vabsd_bu(q6_src, q0_src);                         \
131    p7_a_sub_p0 = __lsx_vabsd_bu(p7_src, p0_src);                         \
132    q7_a_sub_q0 = __lsx_vabsd_bu(q7_src, q0_src);                         \
133                                                                          \
134    p4_a_sub_p0 = __lsx_vmax_bu(p4_a_sub_p0, q4_a_sub_q0);                \
135    flat2_dst = __lsx_vmax_bu(p5_a_sub_p0, q5_a_sub_q0);                  \
136    flat2_dst = __lsx_vmax_bu(p4_a_sub_p0, flat2_dst);                    \
137    p6_a_sub_p0 = __lsx_vmax_bu(p6_a_sub_p0, q6_a_sub_q0);                \
138    flat2_dst = __lsx_vmax_bu(p6_a_sub_p0, flat2_dst);                    \
139    p7_a_sub_p0 = __lsx_vmax_bu(p7_a_sub_p0, q7_a_sub_q0);                \
140    flat2_dst = __lsx_vmax_bu(p7_a_sub_p0, flat2_dst);                    \
141                                                                          \
142    flat2_dst = __lsx_vslt_bu(f_tmp, flat2_dst);                          \
143    flat2_dst = __lsx_vxori_b(flat2_dst, 0xff);                           \
144    flat2_dst = flat2_dst & flat_src;                                     \
145}
146
147#define VP9_FILTER8(p3_src, p2_src, p1_src, p0_src,            \
148                    q0_src, q1_src, q2_src, q3_src,            \
149                    p2_filt8_dst, p1_filt8_dst, p0_filt8_dst,  \
150                    q0_filt8_dst, q1_filt8_dst, q2_filt8_dst)  \
151{                                                              \
152    __m128i tmp0, tmp1, tmp2;                                  \
153                                                               \
154    tmp2 = __lsx_vadd_h(p2_src, p1_src);                       \
155    tmp2 = __lsx_vadd_h(tmp2, p0_src);                         \
156    tmp0 = __lsx_vslli_h(p3_src, 1);                           \
157                                                               \
158    tmp0 = __lsx_vadd_h(tmp0, tmp2);                           \
159    tmp0 = __lsx_vadd_h(tmp0, q0_src);                         \
160    tmp1 = __lsx_vadd_h(tmp0, p3_src);                         \
161    tmp1 = __lsx_vadd_h(tmp1, p2_src);                         \
162    p2_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
163                                                               \
164    tmp1 = __lsx_vadd_h(tmp0, p1_src);                         \
165    tmp1 = __lsx_vadd_h(tmp1, q1_src);                         \
166    p1_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
167                                                               \
168    tmp1 = __lsx_vadd_h(q2_src, q1_src);                       \
169    tmp1 = __lsx_vadd_h(tmp1, q0_src);                         \
170    tmp2 = __lsx_vadd_h(tmp2, tmp1);                           \
171    tmp0 = __lsx_vadd_h(tmp2, p0_src);                         \
172    tmp0 = __lsx_vadd_h(tmp0, p3_src);                         \
173    p0_filt8_dst = __lsx_vsrari_h(tmp0, 3);                    \
174                                                               \
175    tmp0 = __lsx_vadd_h(q2_src, q3_src);                       \
176    tmp0 = __lsx_vadd_h(tmp0, p0_src);                         \
177    tmp0 = __lsx_vadd_h(tmp0, tmp1);                           \
178    tmp1 = __lsx_vadd_h(q3_src, q3_src);                       \
179    tmp1 = __lsx_vadd_h(tmp1, tmp0);                           \
180    q2_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
181                                                               \
182    tmp0 = __lsx_vadd_h(tmp2, q3_src);                         \
183    tmp1 = __lsx_vadd_h(tmp0, q0_src);                         \
184    q0_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
185                                                               \
186    tmp1 = __lsx_vsub_h(tmp0, p2_src);                         \
187    tmp0 = __lsx_vadd_h(q1_src, q3_src);                       \
188    tmp1 = __lsx_vadd_h(tmp0, tmp1);                           \
189    q1_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
190}
191
192#define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,        \
193                     q2_src, q3_src, limit_src, b_limit_src, thresh_src,    \
194                     hev_dst, mask_dst, flat_dst)                           \
195{                                                                           \
196    __m128i p3_asub_p2_tmp, p2_asub_p1_tmp, p1_asub_p0_tmp, q1_asub_q0_tmp; \
197    __m128i p1_asub_q1_tmp, p0_asub_q0_tmp, q3_asub_q2_tmp, q2_asub_q1_tmp; \
198                                                                            \
199    /* absolute subtraction of pixel values */                              \
200    p3_asub_p2_tmp = __lsx_vabsd_bu(p3_src, p2_src);                        \
201    p2_asub_p1_tmp = __lsx_vabsd_bu(p2_src, p1_src);                        \
202    p1_asub_p0_tmp = __lsx_vabsd_bu(p1_src, p0_src);                        \
203    q1_asub_q0_tmp = __lsx_vabsd_bu(q1_src, q0_src);                        \
204    q2_asub_q1_tmp = __lsx_vabsd_bu(q2_src, q1_src);                        \
205    q3_asub_q2_tmp = __lsx_vabsd_bu(q3_src, q2_src);                        \
206    p0_asub_q0_tmp = __lsx_vabsd_bu(p0_src, q0_src);                        \
207    p1_asub_q1_tmp = __lsx_vabsd_bu(p1_src, q1_src);                        \
208                                                                            \
209    /* calculation of hev */                                                \
210    flat_dst = __lsx_vmax_bu(p1_asub_p0_tmp, q1_asub_q0_tmp);               \
211    hev_dst = __lsx_vslt_bu(thresh_src, flat_dst);                          \
212                                                                            \
213    /* calculation of mask */                                               \
214    p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p0_asub_q0_tmp);        \
215    p1_asub_q1_tmp = __lsx_vsrli_b(p1_asub_q1_tmp, 1);                      \
216    p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p1_asub_q1_tmp);        \
217                                                                            \
218    mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_tmp);                  \
219    mask_dst = __lsx_vmax_bu(flat_dst, mask_dst);                           \
220    p3_asub_p2_tmp = __lsx_vmax_bu(p3_asub_p2_tmp, p2_asub_p1_tmp);         \
221    mask_dst = __lsx_vmax_bu(p3_asub_p2_tmp, mask_dst);                     \
222    q2_asub_q1_tmp = __lsx_vmax_bu(q2_asub_q1_tmp, q3_asub_q2_tmp);         \
223    mask_dst = __lsx_vmax_bu(q2_asub_q1_tmp, mask_dst);                     \
224                                                                            \
225    mask_dst = __lsx_vslt_bu(limit_src, mask_dst);                          \
226    mask_dst = __lsx_vxori_b(mask_dst, 0xff);                               \
227}
228
229void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
230                              int32_t b_limit_ptr,
231                              int32_t limit_ptr,
232                              int32_t thresh_ptr)
233{
234    ptrdiff_t stride2 = stride << 1;
235    ptrdiff_t stride3 = stride2 + stride;
236    ptrdiff_t stride4 = stride2 << 1;
237    __m128i mask, hev, flat, thresh, b_limit, limit;
238    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
239
240    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
241              dst, -stride, p3, p2, p1, p0);
242    q0 = __lsx_vld(dst, 0);
243    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
244    q3 = __lsx_vldx(dst, stride3);
245
246    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
247    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
248    limit   = __lsx_vreplgr2vr_b(limit_ptr);
249
250    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
251                 hev, mask, flat);
252
253    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
254                       q1_out);
255
256    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
257    __lsx_vstelm_d(p0_out, dst -  stride, 0, 0);
258    __lsx_vstelm_d(q0_out, dst          , 0, 0);
259    __lsx_vstelm_d(q1_out, dst +  stride, 0, 0);
260}
261
262void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
263                                int32_t b_limit_ptr,
264                                int32_t limit_ptr,
265                                int32_t thresh_ptr)
266{
267    ptrdiff_t stride2 = stride << 1;
268    ptrdiff_t stride3 = stride2 + stride;
269    ptrdiff_t stride4 = stride2 << 1;
270    __m128i mask, hev, flat, thresh0, b_limit0;
271    __m128i limit0, thresh1, b_limit1, limit1;
272    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
273
274    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
275              dst, -stride, p3, p2, p1, p0);
276    q0 = __lsx_vld(dst, 0);
277    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
278    q3 = __lsx_vldx(dst, stride3);
279
280    thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
281    thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
282    thresh0 = __lsx_vilvl_d(thresh1, thresh0);
283
284    b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
285    b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
286    b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
287
288    limit0 = __lsx_vreplgr2vr_b(limit_ptr);
289    limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
290    limit0 = __lsx_vilvl_d(limit1, limit0);
291
292    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
293                 hev, mask, flat);
294    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
295
296    __lsx_vst(p1, dst - stride2, 0);
297    __lsx_vst(p0, dst -  stride, 0);
298    __lsx_vst(q0, dst          , 0);
299    __lsx_vst(q1, dst +  stride, 0);
300}
301
302void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
303                              int32_t b_limit_ptr,
304                              int32_t limit_ptr,
305                              int32_t thresh_ptr)
306{
307    ptrdiff_t stride2 = stride << 1;
308    ptrdiff_t stride3 = stride2 + stride;
309    ptrdiff_t stride4 = stride2 << 1;
310    __m128i mask, hev, flat, thresh, b_limit, limit;
311    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
312    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
313    __m128i p2_filter8, p1_filter8, p0_filter8;
314    __m128i q0_filter8, q1_filter8, q2_filter8;
315    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
316    __m128i zero = __lsx_vldi(0);
317
318    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
319              dst, -stride, p3, p2, p1, p0);
320    q0 = __lsx_vld(dst, 0);
321    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
322    q3 = __lsx_vldx(dst, stride3);
323
324    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
325    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
326    limit   = __lsx_vreplgr2vr_b(limit_ptr);
327
328    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
329                 hev, mask, flat);
330    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
331    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
332                       q1_out);
333
334    flat = __lsx_vilvl_d(zero, flat);
335
336    /* if flat is zero for all pixels, then no need to calculate other filter */
337    if (__lsx_bz_v(flat)) {
338        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
339        __lsx_vstelm_d(p0_out, dst -  stride, 0, 0);
340        __lsx_vstelm_d(q0_out, dst          , 0, 0);
341        __lsx_vstelm_d(q1_out, dst +  stride, 0, 0);
342    } else {
343        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
344                  p3_l, p2_l, p1_l, p0_l);
345        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
346                  q0_l, q1_l, q2_l, q3_l);
347        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
348                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
349
350        /* convert 16 bit output data into 8 bit */
351        DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
352                  zero, p0_filter8, zero, q0_filter8, p2_filter8,
353                  p1_filter8, p0_filter8, q0_filter8);
354        DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
355                  q1_filter8, q2_filter8);
356
357        /* store pixel values */
358        p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
359        p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
360        p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
361        q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
362        q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
363        q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
364
365        __lsx_vstelm_d(p2_out, dst - stride3, 0, 0);
366        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
367        __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
368        __lsx_vstelm_d(q0_out, dst, 0, 0);
369        __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
370        __lsx_vstelm_d(q2_out, dst + stride2, 0, 0);
371    }
372}
373
374void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
375                                int32_t b_limit_ptr,
376                                int32_t limit_ptr,
377                                int32_t thresh_ptr)
378{
379    ptrdiff_t stride2 = stride << 1;
380    ptrdiff_t stride3 = stride2 + stride;
381    ptrdiff_t stride4 = stride2 << 1;
382    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
383    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
384    __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
385    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
386    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
387    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
388    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
389    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
390    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
391    __m128i zero = __lsx_vldi(0);
392
393    /* load vector elements */
394    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
395              dst, -stride, p3, p2, p1, p0);
396    q0 = __lsx_vld(dst, 0);
397    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
398    q3 = __lsx_vldx(dst, stride3);
399
400    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
401    tmp    = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
402    thresh = __lsx_vilvl_d(tmp, thresh);
403
404    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
405    tmp     = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
406    b_limit = __lsx_vilvl_d(tmp, b_limit);
407
408    limit = __lsx_vreplgr2vr_b(limit_ptr);
409    tmp   = __lsx_vreplgr2vr_b(limit_ptr >> 8);
410    limit = __lsx_vilvl_d(tmp, limit);
411
412    /* mask and hev */
413    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
414                 hev, mask, flat);
415    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
416    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
417                       q1_out);
418
419    /* if flat is zero for all pixels, then no need to calculate other filter */
420    if (__lsx_bz_v(flat)) {
421        __lsx_vst(p1_out, dst - stride2, 0);
422        __lsx_vst(p0_out, dst - stride, 0);
423        __lsx_vst(q0_out, dst, 0);
424        __lsx_vst(q1_out, dst + stride, 0);
425    } else {
426        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
427                  p3_l, p2_l, p1_l, p0_l);
428        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
429                  q0_l, q1_l, q2_l, q3_l);
430        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
431                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
432
433        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
434                  p3_h, p2_h, p1_h, p0_h);
435        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
436                  q0_h, q1_h, q2_h, q3_h);
437        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
438                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
439
440        /* convert 16 bit output data into 8 bit */
441        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
442                  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
443                  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
444        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
445                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
446
447        /* store pixel values */
448        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
449        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
450        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
451        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
452        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
453        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
454
455
456        __lsx_vstx(p2_out, dst, -stride3);
457        __lsx_vstx(p1_out, dst, -stride2);
458        __lsx_vstx(p0_out, dst, -stride);
459        __lsx_vst(q0_out, dst, 0);
460        __lsx_vstx(q1_out, dst, stride);
461        __lsx_vstx(q2_out, dst, stride2);
462    }
463}
464
465void ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
466                                int32_t b_limit_ptr,
467                                int32_t limit_ptr,
468                                int32_t thresh_ptr)
469{
470    ptrdiff_t stride2 = stride << 1;
471    ptrdiff_t stride3 = stride2 + stride;
472    ptrdiff_t stride4 = stride2 << 1;
473    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
474    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
475    __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
476    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
477    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
478    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
479    __m128i zero = __lsx_vldi(0);
480
481    /* load vector elements */
482    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
483              dst, -stride, p3, p2, p1, p0);
484    q0 = __lsx_vld(dst, 0);
485    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
486    q3 = __lsx_vldx(dst, stride3);
487
488    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
489    tmp    = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
490    thresh = __lsx_vilvl_d(tmp, thresh);
491
492    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
493    tmp     = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
494    b_limit = __lsx_vilvl_d(tmp, b_limit);
495
496    limit = __lsx_vreplgr2vr_b(limit_ptr);
497    tmp   = __lsx_vreplgr2vr_b(limit_ptr >> 8);
498    limit = __lsx_vilvl_d(tmp, limit);
499
500    /* mask and hev */
501    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
502                 hev, mask, flat);
503    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
504    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
505                       q1_out);
506
507    flat = __lsx_vilvl_d(zero, flat);
508
509    /* if flat is zero for all pixels, then no need to calculate other filter */
510    if (__lsx_bz_v(flat)) {
511        __lsx_vstx(p1_out, dst, -stride2);
512        __lsx_vstx(p0_out, dst, -stride);
513        __lsx_vst(q0_out, dst, 0);
514        __lsx_vstx(q1_out, dst, stride);
515    } else {
516        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
517                  p3_l, p2_l, p1_l, p0_l);
518        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
519                  q0_l, q1_l, q2_l, q3_l);
520        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
521                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
522
523        /* convert 16 bit output data into 8 bit */
524        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
525                  p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
526                  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
527        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
528                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
529
530        /* store pixel values */
531        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
532        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
533        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
534        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
535        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
536        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
537
538        __lsx_vstx(p2_out, dst, -stride3);
539        __lsx_vstx(p1_out, dst, -stride2);
540        __lsx_vstx(p0_out, dst, -stride);
541        __lsx_vst(q0_out, dst, 0);
542        __lsx_vstx(q1_out, dst, stride);
543        __lsx_vstx(q2_out, dst, stride2);
544    }
545}
546
547void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
548                                int32_t b_limit_ptr,
549                                int32_t limit_ptr,
550                                int32_t thresh_ptr)
551{
552    ptrdiff_t stride2 = stride << 1;
553    ptrdiff_t stride3 = stride2 + stride;
554    ptrdiff_t stride4 = stride2 << 1;
555    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
556    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
557    __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
558    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
559    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
560    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
561    __m128i zero = { 0 };
562
563    /* load vector elements */
564    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
565              dst, -stride, p3, p2, p1, p0);
566    q0 = __lsx_vld(dst, 0);
567    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
568    q3 = __lsx_vldx(dst, stride3);
569
570    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
571    tmp    = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
572    thresh = __lsx_vilvl_d(tmp, thresh);
573
574    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
575    tmp     = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
576    b_limit = __lsx_vilvl_d(tmp, b_limit);
577
578    limit = __lsx_vreplgr2vr_b(limit_ptr);
579    tmp   = __lsx_vreplgr2vr_b(limit_ptr >> 8);
580    limit = __lsx_vilvl_d(tmp, limit);
581
582    /* mask and hev */
583    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
584                 hev, mask, flat);
585    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
586    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
587                       q1_out);
588
589    flat = __lsx_vilvh_d(flat, zero);
590
591    /* if flat is zero for all pixels, then no need to calculate other filter */
592    if (__lsx_bz_v(flat)) {
593        __lsx_vstx(p1_out, dst, -stride2);
594        __lsx_vstx(p0_out, dst, -stride);
595        __lsx_vst(q0_out, dst, 0);
596        __lsx_vstx(q1_out, dst, stride);
597    } else {
598        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
599                  p3_h, p2_h, p1_h, p0_h);
600        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
601                  q0_h, q1_h, q2_h, q3_h);
602        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
603                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
604
605        /* convert 16 bit output data into 8 bit */
606        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
607                  p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
608                  p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
609        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
610                  q2_filt8_h, q1_filt8_h, q2_filt8_h);
611
612        /* store pixel values */
613        p2_out = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
614        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
615        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
616        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
617        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
618        q2_out = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
619
620        __lsx_vstx(p2_out, dst, -stride3);
621        __lsx_vstx(p1_out, dst, -stride2);
622        __lsx_vstx(p0_out, dst, -stride);
623        __lsx_vst(q0_out, dst, 0);
624        __lsx_vstx(q1_out, dst, stride);
625        __lsx_vstx(q2_out, dst, stride2);
626    }
627}
628
629static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *dst, ptrdiff_t stride,
630                                        uint8_t *filter48,
631                                        int32_t b_limit_ptr,
632                                        int32_t limit_ptr,
633                                        int32_t thresh_ptr)
634{
635    ptrdiff_t stride2 = stride << 1;
636    ptrdiff_t stride3 = stride2 + stride;
637    ptrdiff_t stride4 = stride2 << 1;
638    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
639    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
640    __m128i flat, mask, hev, thresh, b_limit, limit;
641    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
642    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
643    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
644    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
645    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
646    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
647    __m128i zero = __lsx_vldi(0);
648
649    /* load vector elements */
650    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
651              dst, -stride, p3, p2, p1, p0);
652    q0 = __lsx_vld(dst, 0);
653    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
654    q3 = __lsx_vldx(dst, stride3);
655
656    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
657    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
658    limit   = __lsx_vreplgr2vr_b(limit_ptr);
659
660    /* mask and hev */
661    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
662                 hev, mask, flat);
663    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
664    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
665                       q1_out);
666
667    /* if flat is zero for all pixels, then no need to calculate other filter */
668    if (__lsx_bz_v(flat)) {
669        __lsx_vstx(p1_out, dst, -stride2);
670        __lsx_vstx(p0_out, dst, -stride);
671        __lsx_vst(q0_out, dst, 0);
672        __lsx_vstx(q1_out, dst, stride);
673        return 1;
674    } else {
675        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
676                  p3_l, p2_l, p1_l, p0_l);
677        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
678                  q0_l, q1_l, q2_l, q3_l);
679        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
680                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
681
682        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
683                  p3_h, p2_h, p1_h, p0_h);
684        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
685                  q0_h, q1_h, q2_h, q3_h);
686        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
687                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
688
689        /* convert 16 bit output data into 8 bit */
690        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
691                  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
692                  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
693        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
694                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
695
696        /* store pixel values */
697        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
698        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
699        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
700        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
701        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
702        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
703
704        __lsx_vst(p2_out, filter48, 0);
705        __lsx_vst(p1_out, filter48, 16);
706        __lsx_vst(p0_out, filter48, 32);
707        __lsx_vst(q0_out, filter48, 48);
708        __lsx_vst(q1_out, filter48, 64);
709        __lsx_vst(q2_out, filter48, 80);
710        __lsx_vst(flat, filter48, 96);
711
712        return 0;
713    }
714}
715
716static void vp9_hz_lpf_t16_16w(uint8_t *dst, ptrdiff_t stride,
717                               uint8_t *filter48)
718{
719    ptrdiff_t stride2 = stride << 1;
720    ptrdiff_t stride3 = stride2 + stride;
721    ptrdiff_t stride4 = stride2 << 1;
722    uint8_t *dst_tmp = dst - stride4;
723    uint8_t *dst_tmp1 = dst + stride4;
724    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
725    __m128i flat, flat2, filter8;
726    __m128i zero = __lsx_vldi(0);
727    __m128i out_h, out_l;
728    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
729    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
730    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
731    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
732    v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
733    v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
734    v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
735    v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
736    v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
737
738    flat = __lsx_vld(filter48, 96);
739
740    DUP4_ARG2(__lsx_vldx, dst_tmp, -stride4, dst_tmp, -stride3, dst_tmp,
741              -stride2, dst_tmp, -stride, p7, p6, p5, p4);
742    p3 = __lsx_vld(dst_tmp, 0);
743    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
744    p0 = __lsx_vldx(dst_tmp, stride3);
745
746    q0 = __lsx_vld(dst, 0);
747    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
748    q3 = __lsx_vldx(dst, stride3);
749
750    q4 = __lsx_vld(dst_tmp1, 0);
751    DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
752    q7 = __lsx_vldx(dst_tmp1, stride3);
753    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
754
755    /* if flat2 is zero for all pixels, then no need to calculate other filter */
756    if (__lsx_bz_v(flat2)) {
757        DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48,
758                  48, p2, p1, p0, q0);
759        DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
760
761        __lsx_vstx(p2, dst, -stride3);
762        __lsx_vstx(p1, dst, -stride2);
763        __lsx_vstx(p0, dst, -stride);
764        __lsx_vst(q0, dst, 0);
765        __lsx_vstx(q1, dst, stride);
766        __lsx_vstx(q2, dst, stride2);
767    } else {
768        dst = dst_tmp - stride3;
769
770        p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
771        p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
772        p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
773        p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
774        p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
775        p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
776        p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
777        p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
778
779        q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
780
781        tmp0_l = p7_l_in << 3;
782        tmp0_l -= p7_l_in;
783        tmp0_l += p6_l_in;
784        tmp0_l += q0_l_in;
785        tmp1_l = p6_l_in + p5_l_in;
786        tmp1_l += p4_l_in;
787        tmp1_l += p3_l_in;
788        tmp1_l += p2_l_in;
789        tmp1_l += p1_l_in;
790        tmp1_l += p0_l_in;
791        tmp1_l += tmp0_l;
792
793        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
794
795        p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
796        p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
797        p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
798        p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
799
800        p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
801        p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
802        p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
803        p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
804        q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
805
806        tmp0_h = p7_h_in << 3;
807        tmp0_h -= p7_h_in;
808        tmp0_h += p6_h_in;
809        tmp0_h += q0_h_in;
810        tmp1_h = p6_h_in + p5_h_in;
811        tmp1_h += p4_h_in;
812        tmp1_h += p3_h_in;
813        tmp1_h += p2_h_in;
814        tmp1_h += p1_h_in;
815        tmp1_h += p0_h_in;
816        tmp1_h += tmp0_h;
817
818        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
819
820        out_l = __lsx_vpickev_b(out_h, out_l);
821        p6 = __lsx_vbitsel_v(p6, out_l, flat2);
822        __lsx_vst(p6, dst, 0);
823        dst += stride;
824
825        /* p5 */
826        q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
827        tmp0_l = p5_l_in - p6_l_in;
828        tmp0_l += q1_l_in;
829        tmp0_l -= p7_l_in;
830        tmp1_l += tmp0_l;
831        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
832
833        q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
834        tmp0_h = p5_h_in - p6_h_in;
835        tmp0_h += q1_h_in;
836        tmp0_h -= p7_h_in;
837        tmp1_h += tmp0_h;
838        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
839
840        out_l = __lsx_vpickev_b(out_h, out_l);
841        p5 = __lsx_vbitsel_v(p5, out_l, flat2);
842        __lsx_vst(p5, dst, 0);
843        dst += stride;
844
845        /* p4 */
846        q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
847        tmp0_l = p4_l_in - p5_l_in;
848        tmp0_l += q2_l_in;
849        tmp0_l -= p7_l_in;
850        tmp1_l += tmp0_l;
851        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
852
853        q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
854        tmp0_h = p4_h_in - p5_h_in;
855        tmp0_h += q2_h_in;
856        tmp0_h -= p7_h_in;
857        tmp1_h += tmp0_h;
858        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
859
860        out_l = __lsx_vpickev_b(out_h, out_l);
861        p4 = __lsx_vbitsel_v(p4, out_l, flat2);
862        __lsx_vst(p4, dst, 0);
863        dst += stride;
864
865        /* p3 */
866        q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
867        tmp0_l = p3_l_in - p4_l_in;
868        tmp0_l += q3_l_in;
869        tmp0_l -= p7_l_in;
870        tmp1_l += tmp0_l;
871        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
872
873        q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
874        tmp0_h = p3_h_in - p4_h_in;
875        tmp0_h += q3_h_in;
876        tmp0_h -= p7_h_in;
877        tmp1_h += tmp0_h;
878        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
879
880        out_l = __lsx_vpickev_b(out_h, out_l);
881        p3 = __lsx_vbitsel_v(p3, out_l, flat2);
882        __lsx_vst(p3, dst, 0);
883        dst += stride;
884
885        /* p2 */
886        q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
887        filter8 = __lsx_vld(filter48, 0);
888        tmp0_l = p2_l_in - p3_l_in;
889        tmp0_l += q4_l_in;
890        tmp0_l -= p7_l_in;
891        tmp1_l += tmp0_l;
892        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
893
894        q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
895        tmp0_h = p2_h_in - p3_h_in;
896        tmp0_h += q4_h_in;
897        tmp0_h -= p7_h_in;
898        tmp1_h += tmp0_h;
899        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
900
901        out_l = __lsx_vpickev_b(out_h, out_l);
902        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
903        __lsx_vst(filter8, dst, 0);
904        dst += stride;
905
906        /* p1 */
907        q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
908        filter8 = __lsx_vld(filter48, 16);
909        tmp0_l = p1_l_in - p2_l_in;
910        tmp0_l += q5_l_in;
911        tmp0_l -= p7_l_in;
912        tmp1_l += tmp0_l;
913        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
914
915        q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
916        tmp0_h = p1_h_in - p2_h_in;
917        tmp0_h += q5_h_in;
918        tmp0_h -= p7_h_in;
919        tmp1_h += tmp0_h;
920        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
921
922        out_l = __lsx_vpickev_b(out_h, out_l);
923        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
924        __lsx_vst(filter8, dst, 0);
925        dst += stride;
926
927        /* p0 */
928        q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
929        filter8 = __lsx_vld(filter48, 32);
930        tmp0_l = p0_l_in - p1_l_in;
931        tmp0_l += q6_l_in;
932        tmp0_l -= p7_l_in;
933        tmp1_l += tmp0_l;
934        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
935
936        q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
937        tmp0_h = p0_h_in - p1_h_in;
938        tmp0_h += q6_h_in;
939        tmp0_h -= p7_h_in;
940        tmp1_h += tmp0_h;
941        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
942
943        out_l = __lsx_vpickev_b(out_h, out_l);
944        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
945        __lsx_vst(filter8, dst, 0);
946        dst += stride;
947
948        /* q0 */
949        q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
950        filter8 = __lsx_vld(filter48, 48);
951        tmp0_l = q7_l_in - p0_l_in;
952        tmp0_l += q0_l_in;
953        tmp0_l -= p7_l_in;
954        tmp1_l += tmp0_l;
955        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
956
957        q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
958        tmp0_h = q7_h_in - p0_h_in;
959        tmp0_h += q0_h_in;
960        tmp0_h -= p7_h_in;
961        tmp1_h += tmp0_h;
962        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
963
964        out_l = __lsx_vpickev_b(out_h, out_l);
965        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
966        __lsx_vst(filter8, dst, 0);
967        dst += stride;
968
969        /* q1 */
970        filter8 = __lsx_vld(filter48, 64);
971        tmp0_l = q7_l_in - q0_l_in;
972        tmp0_l += q1_l_in;
973        tmp0_l -= p6_l_in;
974        tmp1_l += tmp0_l;
975        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
976
977        tmp0_h = q7_h_in - q0_h_in;
978        tmp0_h += q1_h_in;
979        tmp0_h -= p6_h_in;
980        tmp1_h += tmp0_h;
981        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
982
983        out_l = __lsx_vpickev_b(out_h, out_l);
984        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
985        __lsx_vst(filter8, dst, 0);
986        dst += stride;
987
988        /* q2 */
989        filter8 = __lsx_vld(filter48, 80);
990        tmp0_l = q7_l_in - q1_l_in;
991        tmp0_l += q2_l_in;
992        tmp0_l -= p5_l_in;
993        tmp1_l += tmp0_l;
994        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
995
996        tmp0_h = q7_h_in - q1_h_in;
997        tmp0_h += q2_h_in;
998        tmp0_h -= p5_h_in;
999        tmp1_h += tmp0_h;
1000        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1001
1002        out_l = __lsx_vpickev_b(out_h, out_l);
1003        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
1004        __lsx_vst(filter8, dst, 0);
1005        dst += stride;
1006
1007        /* q3 */
1008        tmp0_l = q7_l_in - q2_l_in;
1009        tmp0_l += q3_l_in;
1010        tmp0_l -= p4_l_in;
1011        tmp1_l += tmp0_l;
1012        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1013
1014        tmp0_h = q7_h_in - q2_h_in;
1015        tmp0_h += q3_h_in;
1016        tmp0_h -= p4_h_in;
1017        tmp1_h += tmp0_h;
1018        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1019
1020        out_l = __lsx_vpickev_b(out_h, out_l);
1021        q3 = __lsx_vbitsel_v(q3, out_l, flat2);
1022        __lsx_vst(q3, dst, 0);
1023        dst += stride;
1024
1025        /* q4 */
1026        tmp0_l = q7_l_in - q3_l_in;
1027        tmp0_l += q4_l_in;
1028        tmp0_l -= p3_l_in;
1029        tmp1_l += tmp0_l;
1030        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1031
1032        tmp0_h = q7_h_in - q3_h_in;
1033        tmp0_h += q4_h_in;
1034        tmp0_h -= p3_h_in;
1035        tmp1_h += tmp0_h;
1036        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1037
1038        out_l = __lsx_vpickev_b(out_h, out_l);
1039        q4 = __lsx_vbitsel_v(q4, out_l, flat2);
1040        __lsx_vst(q4, dst, 0);
1041        dst += stride;
1042
1043        /* q5 */
1044        tmp0_l = q7_l_in - q4_l_in;
1045        tmp0_l += q5_l_in;
1046        tmp0_l -= p2_l_in;
1047        tmp1_l += tmp0_l;
1048        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1049
1050        tmp0_h = q7_h_in - q4_h_in;
1051        tmp0_h += q5_h_in;
1052        tmp0_h -= p2_h_in;
1053        tmp1_h += tmp0_h;
1054        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1055
1056        out_l = __lsx_vpickev_b(out_h, out_l);
1057        q5 = __lsx_vbitsel_v(q5, out_l, flat2);
1058        __lsx_vst(q5, dst, 0);
1059        dst += stride;
1060
1061        /* q6 */
1062        tmp0_l = q7_l_in - q5_l_in;
1063        tmp0_l += q6_l_in;
1064        tmp0_l -= p1_l_in;
1065        tmp1_l += tmp0_l;
1066        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1067
1068        tmp0_h = q7_h_in - q5_h_in;
1069        tmp0_h += q6_h_in;
1070        tmp0_h -= p1_h_in;
1071        tmp1_h += tmp0_h;
1072        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1073
1074        out_l = __lsx_vpickev_b(out_h, out_l);
1075        q6 = __lsx_vbitsel_v(q6, out_l, flat2);
1076        __lsx_vst(q6, dst, 0);
1077    }
1078}
1079
1080void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
1081                                int32_t b_limit_ptr,
1082                                int32_t limit_ptr,
1083                                int32_t thresh_ptr)
1084{
1085    uint8_t filter48[16 * 8] __attribute__ ((aligned(16)));
1086    uint8_t early_exit = 0;
1087
1088    early_exit = vp9_hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0],
1089                                          b_limit_ptr, limit_ptr, thresh_ptr);
1090
1091    if (0 == early_exit) {
1092        vp9_hz_lpf_t16_16w(dst, stride, filter48);
1093    }
1094}
1095
1096void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
1097                               int32_t b_limit_ptr,
1098                               int32_t limit_ptr,
1099                               int32_t thresh_ptr)
1100{
1101    ptrdiff_t stride2 = stride << 1;
1102    ptrdiff_t stride3 = stride2 + stride;
1103    ptrdiff_t stride4 = stride2 << 1;
1104    uint8_t *dst_tmp = dst - stride4;
1105    uint8_t *dst_tmp1 = dst + stride4;
1106    __m128i zero = __lsx_vldi(0);
1107    __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
1108    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
1109    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1110    __m128i p0_filter16, p1_filter16;
1111    __m128i p2_filter8, p1_filter8, p0_filter8;
1112    __m128i q0_filter8, q1_filter8, q2_filter8;
1113    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
1114    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
1115    __m128i tmp0, tmp1, tmp2;
1116
1117    /* load vector elements */
1118    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
1119              dst, -stride, p3, p2, p1, p0);
1120    q0 = __lsx_vld(dst, 0);
1121    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
1122    q3 = __lsx_vldx(dst, stride3);
1123
1124    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
1125    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1126    limit   = __lsx_vreplgr2vr_b(limit_ptr);
1127
1128    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1129                 hev, mask, flat);
1130    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1131    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1132                       q1_out);
1133
1134    flat = __lsx_vilvl_d(zero, flat);
1135
1136    /* if flat is zero for all pixels, then no need to calculate other filter */
1137    if (__lsx_bz_v(flat)) {
1138        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
1139        __lsx_vstelm_d(p0_out, dst -   stride, 0, 0);
1140        __lsx_vstelm_d(q0_out, dst           , 0, 0);
1141        __lsx_vstelm_d(q1_out, dst +   stride, 0, 0);
1142    } else {
1143        /* convert 8 bit input data into 16 bit */
1144        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1145                  p3_l, p2_l, p1_l, p0_l);
1146        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1147                  q0_l, q1_l, q2_l, q3_l);
1148        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
1149                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1150                    q1_filter8, q2_filter8);
1151
1152        /* convert 16 bit output data into 8 bit */
1153        DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
1154                  zero, p0_filter8, zero, q0_filter8, p2_filter8,
1155                  p1_filter8, p0_filter8, q0_filter8);
1156        DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
1157                  q1_filter8, q2_filter8);
1158
1159        /* store pixel values */
1160        p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
1161        p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
1162        p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
1163        q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
1164        q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
1165        q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
1166
1167        /* load 16 vector elements */
1168        DUP4_ARG2(__lsx_vld, dst_tmp - stride4, 0, dst_tmp - stride3, 0,
1169                  dst_tmp - stride2, 0, dst_tmp - stride, 0, p7, p6, p5, p4);
1170        DUP4_ARG2(__lsx_vld, dst_tmp1, 0, dst_tmp1 + stride, 0,
1171                dst_tmp1 + stride2, 0, dst_tmp1 + stride3, 0, q4, q5, q6, q7);
1172
1173        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1174
1175        /* if flat2 is zero for all pixels, then no need to calculate other filter */
1176        if (__lsx_bz_v(flat2)) {
1177            dst -= stride3;
1178            __lsx_vstelm_d(p2_out, dst, 0, 0);
1179            dst += stride;
1180            __lsx_vstelm_d(p1_out, dst, 0, 0);
1181            dst += stride;
1182            __lsx_vstelm_d(p0_out, dst, 0, 0);
1183            dst += stride;
1184            __lsx_vstelm_d(q0_out, dst, 0, 0);
1185            dst += stride;
1186            __lsx_vstelm_d(q1_out, dst, 0, 0);
1187            dst += stride;
1188            __lsx_vstelm_d(q2_out, dst, 0, 0);
1189        } else {
1190            /* LSB(right) 8 pixel operation */
1191            DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4,
1192                      p7_l, p6_l, p5_l, p4_l);
1193            DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7,
1194                      q4_l, q5_l, q6_l, q7_l);
1195
1196            tmp0 = __lsx_vslli_h(p7_l, 3);
1197            tmp0 = __lsx_vsub_h(tmp0, p7_l);
1198            tmp0 = __lsx_vadd_h(tmp0, p6_l);
1199            tmp0 = __lsx_vadd_h(tmp0, q0_l);
1200
1201            dst = dst_tmp - stride3;
1202
1203            /* calculation of p6 and p5 */
1204            tmp1 = __lsx_vadd_h(p6_l, p5_l);
1205            tmp1 = __lsx_vadd_h(tmp1, p4_l);
1206            tmp1 = __lsx_vadd_h(tmp1, p3_l);
1207            tmp1 = __lsx_vadd_h(tmp1, p2_l);
1208            tmp1 = __lsx_vadd_h(tmp1, p1_l);
1209            tmp1 = __lsx_vadd_h(tmp1, p0_l);
1210            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1211
1212            p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1213            tmp0 = __lsx_vsub_h(p5_l, p6_l);
1214            tmp0 = __lsx_vadd_h(tmp0, q1_l);
1215            tmp0 = __lsx_vsub_h(tmp0, p7_l);
1216            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1217
1218            p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1219            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1220                      p1_filter16, p0_filter16, p1_filter16);
1221            p0_filter16 = __lsx_vbitsel_v(p6, p0_filter16, flat2);
1222            p1_filter16 = __lsx_vbitsel_v(p5, p1_filter16, flat2);
1223            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1224            dst += stride;
1225            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1226            dst += stride;
1227
1228            /* calculation of p4 and p3 */
1229            tmp0 = __lsx_vsub_h(p4_l, p5_l);
1230            tmp0 = __lsx_vadd_h(tmp0, q2_l);
1231            tmp0 = __lsx_vsub_h(tmp0, p7_l);
1232            tmp2 = __lsx_vsub_h(p3_l, p4_l);
1233            tmp2 = __lsx_vadd_h(tmp2, q3_l);
1234            tmp2 = __lsx_vsub_h(tmp2, p7_l);
1235            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1236            p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1237            tmp1 = __lsx_vadd_h(tmp1, tmp2);
1238            p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1239            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1240                      p1_filter16, p0_filter16, p1_filter16);
1241            p0_filter16 = __lsx_vbitsel_v(p4, p0_filter16, flat2);
1242            p1_filter16 = __lsx_vbitsel_v(p3, p1_filter16, flat2);
1243            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1244            dst += stride;
1245            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1246            dst += stride;
1247
1248            /* calculation of p2 and p1 */
1249            tmp0 = __lsx_vsub_h(p2_l, p3_l);
1250            tmp0 = __lsx_vadd_h(tmp0, q4_l);
1251            tmp0 = __lsx_vsub_h(tmp0, p7_l);
1252            tmp2 = __lsx_vsub_h(p1_l, p2_l);
1253            tmp2 = __lsx_vadd_h(tmp2, q5_l);
1254            tmp2 = __lsx_vsub_h(tmp2, p7_l);
1255            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1256            p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1257            tmp1 = __lsx_vadd_h(tmp1, tmp2);
1258            p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1259            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1260                      p1_filter16, p0_filter16, p1_filter16);
1261            p0_filter16 = __lsx_vbitsel_v(p2_out, p0_filter16, flat2);
1262            p1_filter16 = __lsx_vbitsel_v(p1_out, p1_filter16, flat2);
1263            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1264            dst += stride;
1265            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1266            dst += stride;
1267
1268            /* calculation of p0 and q0 */
1269            tmp0 = __lsx_vsub_h(p0_l, p1_l);
1270            tmp0 = __lsx_vadd_h(tmp0, q6_l);
1271            tmp0 = __lsx_vsub_h(tmp0, p7_l);
1272            tmp2 = __lsx_vsub_h(q7_l, p0_l);
1273            tmp2 = __lsx_vadd_h(tmp2, q0_l);
1274            tmp2 = __lsx_vsub_h(tmp2, p7_l);
1275            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1276            p0_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1277            tmp1 = __lsx_vadd_h(tmp1, tmp2);
1278            p1_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1279            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1280                      p1_filter16, p0_filter16, p1_filter16);
1281            p0_filter16 = __lsx_vbitsel_v(p0_out, p0_filter16, flat2);
1282            p1_filter16 = __lsx_vbitsel_v(q0_out, p1_filter16, flat2);
1283            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1284            dst += stride;
1285            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1286            dst += stride;
1287
1288            /* calculation of q1 and q2 */
1289            tmp0 = __lsx_vsub_h(q7_l, q0_l);
1290            tmp0 = __lsx_vadd_h(tmp0, q1_l);
1291            tmp0 = __lsx_vsub_h(tmp0, p6_l);
1292            tmp2 = __lsx_vsub_h(q7_l, q1_l);
1293            tmp2 = __lsx_vadd_h(tmp2, q2_l);
1294            tmp2 = __lsx_vsub_h(tmp2, p5_l);
1295            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1296            p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1297            tmp1 = __lsx_vadd_h(tmp1, tmp2);
1298            p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1299            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1300                      p1_filter16, p0_filter16, p1_filter16);
1301            p0_filter16 = __lsx_vbitsel_v(q1_out, p0_filter16, flat2);
1302            p1_filter16 = __lsx_vbitsel_v(q2_out, p1_filter16, flat2);
1303            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1304            dst += stride;
1305            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1306            dst += stride;
1307
1308            /* calculation of q3 and q4 */
1309            tmp0 = __lsx_vsub_h(q7_l, q2_l);
1310            tmp0 = __lsx_vadd_h(tmp0, q3_l);
1311            tmp0 = __lsx_vsub_h(tmp0, p4_l);
1312            tmp2 = __lsx_vsub_h(q7_l, q3_l);
1313            tmp2 = __lsx_vadd_h(tmp2, q4_l);
1314            tmp2 = __lsx_vsub_h(tmp2, p3_l);
1315            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1316            p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1317            tmp1 = __lsx_vadd_h(tmp1, tmp2);
1318            p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1319            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1320                      p1_filter16, p0_filter16, p1_filter16);
1321            p0_filter16 = __lsx_vbitsel_v(q3, p0_filter16, flat2);
1322            p1_filter16 = __lsx_vbitsel_v(q4, p1_filter16, flat2);
1323            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1324            dst += stride;
1325            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1326            dst += stride;
1327
1328            /* calculation of q5 and q6 */
1329            tmp0 = __lsx_vsub_h(q7_l, q4_l);
1330            tmp0 = __lsx_vadd_h(tmp0, q5_l);
1331            tmp0 = __lsx_vsub_h(tmp0, p2_l);
1332            tmp2 = __lsx_vsub_h(q7_l, q5_l);
1333            tmp2 = __lsx_vadd_h(tmp2, q6_l);
1334            tmp2 = __lsx_vsub_h(tmp2, p1_l);
1335            tmp1 = __lsx_vadd_h(tmp1, tmp0);
1336            p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1337            tmp1 = __lsx_vadd_h(tmp1, tmp2);
1338            p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1339            DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1340                      p1_filter16, p0_filter16, p1_filter16);
1341            p0_filter16 = __lsx_vbitsel_v(q5, p0_filter16, flat2);
1342            p1_filter16 = __lsx_vbitsel_v(q6, p1_filter16, flat2);
1343            __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1344            dst += stride;
1345            __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1346        }
1347    }
1348}
1349
1350void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
1351                              int32_t b_limit_ptr,
1352                              int32_t limit_ptr,
1353                              int32_t thresh_ptr)
1354{
1355    ptrdiff_t stride2 = stride << 1;
1356    ptrdiff_t stride3 = stride2 + stride;
1357    ptrdiff_t stride4 = stride2 << 1;
1358    uint8_t *dst_tmp1 = dst - 4;
1359    uint8_t *dst_tmp2 = dst_tmp1 + stride4;
1360    __m128i mask, hev, flat, limit, thresh, b_limit;
1361    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1362    __m128i vec0, vec1, vec2, vec3;
1363
1364    p3 = __lsx_vld(dst_tmp1, 0);
1365    DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, p2, p1);
1366    p0 = __lsx_vldx(dst_tmp1, stride3);
1367    q0 = __lsx_vld(dst_tmp2, 0);
1368    DUP2_ARG2(__lsx_vldx, dst_tmp2, stride, dst_tmp2, stride2, q1, q2);
1369    q3 = __lsx_vldx(dst_tmp2, stride3);
1370
1371    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
1372    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1373    limit   = __lsx_vreplgr2vr_b(limit_ptr);
1374
1375    LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1376                       p3, p2, p1, p0, q0, q1, q2, q3);
1377    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1378                 hev, mask, flat);
1379    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1380    DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
1381    vec2 = __lsx_vilvl_h(vec1, vec0);
1382    vec3 = __lsx_vilvh_h(vec1, vec0);
1383
1384    dst -= 2;
1385    __lsx_vstelm_w(vec2, dst, 0, 0);
1386    __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1387    __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1388    __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1389    dst += stride4;
1390    __lsx_vstelm_w(vec3, dst, 0, 0);
1391    __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1392    __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1393    __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1394}
1395
1396void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
1397                                int32_t b_limit_ptr,
1398                                int32_t limit_ptr,
1399                                int32_t thresh_ptr)
1400{
1401    ptrdiff_t stride2 = stride << 1;
1402    ptrdiff_t stride3 = stride2 + stride;
1403    ptrdiff_t stride4 = stride2 << 1;
1404    uint8_t *dst_tmp = dst - 4;
1405    __m128i mask, hev, flat;
1406    __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1407    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1408    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
1409    __m128i row8, row9, row10, row11, row12, row13, row14, row15;
1410    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1411
1412    row0 = __lsx_vld(dst_tmp, 0);
1413    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row1, row2);
1414    row3 = __lsx_vldx(dst_tmp, stride3);
1415    dst_tmp += stride4;
1416    row4 = __lsx_vld(dst_tmp, 0);
1417    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1418    row7 = __lsx_vldx(dst_tmp, stride3);
1419    dst_tmp += stride4;
1420    row8 = __lsx_vld(dst_tmp, 0);
1421    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row9, row10);
1422    row11 = __lsx_vldx(dst_tmp, stride3);
1423    dst_tmp += stride4;
1424    row12 = __lsx_vld(dst_tmp, 0);
1425    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1426    row15 = __lsx_vldx(dst_tmp, stride3);
1427
1428    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
1429                        row8, row9, row10, row11, row12, row13, row14, row15,
1430                        p3, p2, p1, p0, q0, q1, q2, q3);
1431
1432    thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
1433    thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1434    thresh0 = __lsx_vilvl_d(thresh1, thresh0);
1435
1436    b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
1437    b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1438    b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
1439
1440    limit0 = __lsx_vreplgr2vr_b(limit_ptr);
1441    limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1442    limit0 = __lsx_vilvl_d(limit1, limit0);
1443
1444    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1445                 hev, mask, flat);
1446    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1447    DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
1448    tmp2 = __lsx_vilvl_h(tmp1, tmp0);
1449    tmp3 = __lsx_vilvh_h(tmp1, tmp0);
1450    DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
1451    tmp4 = __lsx_vilvl_h(tmp1, tmp0);
1452    tmp5 = __lsx_vilvh_h(tmp1, tmp0);
1453
1454    dst -= 2;
1455    __lsx_vstelm_w(tmp2, dst, 0, 0);
1456    __lsx_vstelm_w(tmp2, dst + stride, 0, 1);
1457    __lsx_vstelm_w(tmp2, dst + stride2, 0, 2);
1458    __lsx_vstelm_w(tmp2, dst + stride3, 0, 3);
1459    dst += stride4;
1460    __lsx_vstelm_w(tmp3, dst, 0, 0);
1461    __lsx_vstelm_w(tmp3, dst + stride, 0, 1);
1462    __lsx_vstelm_w(tmp3, dst + stride2, 0, 2);
1463    __lsx_vstelm_w(tmp3, dst + stride3, 0, 3);
1464    dst += stride4;
1465    __lsx_vstelm_w(tmp4, dst, 0, 0);
1466    __lsx_vstelm_w(tmp4, dst + stride, 0, 1);
1467    __lsx_vstelm_w(tmp4, dst + stride2, 0, 2);
1468    __lsx_vstelm_w(tmp4, dst + stride3, 0, 3);
1469    dst += stride4;
1470    __lsx_vstelm_w(tmp5, dst, 0, 0);
1471    __lsx_vstelm_w(tmp5, dst + stride, 0, 1);
1472    __lsx_vstelm_w(tmp5, dst + stride2, 0, 2);
1473    __lsx_vstelm_w(tmp5, dst + stride3, 0, 3);
1474}
1475
1476void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
1477                              int32_t b_limit_ptr,
1478                              int32_t limit_ptr,
1479                              int32_t thresh_ptr)
1480{
1481    ptrdiff_t stride2 = stride << 1;
1482    ptrdiff_t stride3 = stride2 + stride;
1483    ptrdiff_t stride4 = stride2 << 1;
1484    uint8_t *dst_tmp = dst - 4;
1485    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1486    __m128i p1_out, p0_out, q0_out, q1_out;
1487    __m128i flat, mask, hev, thresh, b_limit, limit;
1488    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1489    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1490    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1491    __m128i vec0, vec1, vec2, vec3, vec4;
1492    __m128i zero = __lsx_vldi(0);
1493
1494    /* load vector elements */
1495    p3 = __lsx_vld(dst_tmp, 0);
1496    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
1497    p0 = __lsx_vldx(dst_tmp, stride3);
1498    dst_tmp += stride4;
1499    q0 = __lsx_vld(dst_tmp, 0);
1500    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
1501    q3 = __lsx_vldx(dst_tmp, stride3);
1502
1503    LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1504                       p3, p2, p1, p0, q0, q1, q2, q3);
1505
1506    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
1507    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1508    limit   = __lsx_vreplgr2vr_b(limit_ptr);
1509
1510    /* mask and hev */
1511    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1512                 hev, mask, flat);
1513    /* flat4 */
1514    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1515    /* filter4 */
1516    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1517                       q1_out);
1518
1519    flat = __lsx_vilvl_d(zero, flat);
1520
1521    /* if flat is zero for all pixels, then no need to calculate other filter */
1522    if (__lsx_bz_v(flat)) {
1523        /* Store 4 pixels p1-_q1 */
1524        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1525        vec2 = __lsx_vilvl_h(vec1, vec0);
1526        vec3 = __lsx_vilvh_h(vec1, vec0);
1527
1528        dst -= 2;
1529        __lsx_vstelm_w(vec2, dst, 0, 0);
1530        __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1531        __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1532        __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1533        dst += stride4;
1534        __lsx_vstelm_w(vec3, dst, 0, 0);
1535        __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1536        __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1537        __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1538    } else {
1539        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1540                  p3_l, p2_l, p1_l, p0_l);
1541        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1542                  q0_l, q1_l, q2_l, q3_l);
1543        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1544                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1545        /* convert 16 bit output data into 8 bit */
1546        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
1547                  p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l,
1548                  q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
1549                  q0_filt8_l);
1550        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
1551                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
1552
1553        /* store pixel values */
1554        p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1555        p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1556        p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1557        q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1558        q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1559        q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1560
1561        /* Store 6 pixels p2-_q2 */
1562        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1563        vec2 = __lsx_vilvl_h(vec1, vec0);
1564        vec3 = __lsx_vilvh_h(vec1, vec0);
1565        vec4 = __lsx_vilvl_b(q2, q1);
1566
1567        dst -= 3;
1568        __lsx_vstelm_w(vec2, dst, 0, 0);
1569        __lsx_vstelm_h(vec4, dst, 4, 0);
1570        dst += stride;
1571        __lsx_vstelm_w(vec2, dst, 0, 1);
1572        __lsx_vstelm_h(vec4, dst, 4, 1);
1573        dst += stride;
1574        __lsx_vstelm_w(vec2, dst, 0, 2);
1575        __lsx_vstelm_h(vec4, dst, 4, 2);
1576        dst += stride;
1577        __lsx_vstelm_w(vec2, dst, 0, 3);
1578        __lsx_vstelm_h(vec4, dst, 4, 3);
1579        dst += stride;
1580        __lsx_vstelm_w(vec3, dst, 0, 0);
1581        __lsx_vstelm_h(vec4, dst, 4, 4);
1582        dst += stride;
1583        __lsx_vstelm_w(vec3, dst, 0, 1);
1584        __lsx_vstelm_h(vec4, dst, 4, 5);
1585        dst += stride;
1586        __lsx_vstelm_w(vec3, dst, 0, 2);
1587        __lsx_vstelm_h(vec4, dst, 4, 6);
1588        dst += stride;
1589        __lsx_vstelm_w(vec3, dst, 0, 3);
1590        __lsx_vstelm_h(vec4, dst, 4, 7);
1591    }
1592}
1593
1594void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
1595                                int32_t b_limit_ptr,
1596                                int32_t limit_ptr,
1597                                int32_t thresh_ptr)
1598{
1599    ptrdiff_t stride2 = stride << 1;
1600    ptrdiff_t stride3 = stride2 + stride;
1601    ptrdiff_t stride4 = stride2 << 1;
1602    uint8_t *dst_tmp = dst - 4;
1603    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1604    __m128i p1_out, p0_out, q0_out, q1_out;
1605    __m128i flat, mask, hev, thresh, b_limit, limit;
1606    __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1607    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1608    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1609    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1610    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1611    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1612    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1613    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1614    __m128i zero = __lsx_vldi(0);
1615
1616    p0 = __lsx_vld(dst_tmp, 0);
1617    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1618    p3 = __lsx_vldx(dst_tmp, stride3);
1619    dst_tmp += stride4;
1620    row4 = __lsx_vld(dst_tmp, 0);
1621    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1622    row7 = __lsx_vldx(dst_tmp, stride3);
1623    dst_tmp += stride4;
1624    q3 = __lsx_vld(dst_tmp, 0);
1625    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1626    q0 = __lsx_vldx(dst_tmp, stride3);
1627    dst_tmp += stride4;
1628    row12 = __lsx_vld(dst_tmp, 0);
1629    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1630    row15 = __lsx_vldx(dst_tmp, stride3);
1631
1632    /* transpose 16x8 matrix into 8x16 */
1633    LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1634                        q3, q2, q1, q0, row12, row13, row14, row15,
1635                        p3, p2, p1, p0, q0, q1, q2, q3);
1636
1637    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1638    vec0   = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1639    thresh = __lsx_vilvl_d(vec0, thresh);
1640
1641    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1642    vec0    = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1643    b_limit = __lsx_vilvl_d(vec0, b_limit);
1644
1645    limit = __lsx_vreplgr2vr_b(limit_ptr);
1646    vec0  = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1647    limit = __lsx_vilvl_d(vec0, limit);
1648
1649    /* mask and hev */
1650    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1651                 hev, mask, flat);
1652    /* flat4 */
1653    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1654    /* filter4 */
1655    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1656                       q1_out);
1657
1658    /* if flat is zero for all pixels, then no need to calculate other filter */
1659    if (__lsx_bz_v(flat)) {
1660        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1661        vec2 = __lsx_vilvl_h(vec1, vec0);
1662        vec3 = __lsx_vilvh_h(vec1, vec0);
1663        DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1664        vec4 = __lsx_vilvl_h(vec1, vec0);
1665        vec5 = __lsx_vilvh_h(vec1, vec0);
1666
1667        dst -= 2;
1668        __lsx_vstelm_w(vec2, dst, 0, 0);
1669        __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1670        __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1671        __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1672        dst += stride4;
1673        __lsx_vstelm_w(vec3, dst, 0, 0);
1674        __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1675        __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1676        __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1677        dst += stride4;
1678        __lsx_vstelm_w(vec4, dst, 0, 0);
1679        __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1680        __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1681        __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1682        dst += stride4;
1683        __lsx_vstelm_w(vec5, dst, 0, 0);
1684        __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1685        __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1686        __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1687    } else {
1688        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1689                  p3_l, p2_l, p1_l, p0_l);
1690        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1691                  q0_l, q1_l, q2_l, q3_l);
1692        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1693                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1694
1695        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
1696                  p3_h, p2_h, p1_h, p0_h);
1697        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
1698                  q0_h, q1_h, q2_h, q3_h);
1699
1700        /* filter8 */
1701        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
1702                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
1703
1704        /* convert 16 bit output data into 8 bit */
1705        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
1706                  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
1707                  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1708        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
1709                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
1710
1711        /* store pixel values */
1712        p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1713        p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1714        p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1715        q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1716        q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1717        q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1718
1719        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1720        vec3 = __lsx_vilvl_h(vec1, vec0);
1721        vec4 = __lsx_vilvh_h(vec1, vec0);
1722        DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1723        vec6 = __lsx_vilvl_h(vec1, vec0);
1724        vec7 = __lsx_vilvh_h(vec1, vec0);
1725        vec2 = __lsx_vilvl_b(q2, q1);
1726        vec5 = __lsx_vilvh_b(q2, q1);
1727
1728        dst -= 3;
1729        __lsx_vstelm_w(vec3, dst, 0, 0);
1730        __lsx_vstelm_h(vec2, dst, 4, 0);
1731        dst += stride;
1732        __lsx_vstelm_w(vec3, dst, 0, 1);
1733        __lsx_vstelm_h(vec2, dst, 4, 1);
1734        dst += stride;
1735        __lsx_vstelm_w(vec3, dst, 0, 2);
1736        __lsx_vstelm_h(vec2, dst, 4, 2);
1737        dst += stride;
1738        __lsx_vstelm_w(vec3, dst, 0, 3);
1739        __lsx_vstelm_h(vec2, dst, 4, 3);
1740        dst += stride;
1741        __lsx_vstelm_w(vec4, dst, 0, 0);
1742        __lsx_vstelm_h(vec2, dst, 4, 4);
1743        dst += stride;
1744        __lsx_vstelm_w(vec4, dst, 0, 1);
1745        __lsx_vstelm_h(vec2, dst, 4, 5);
1746        dst += stride;
1747        __lsx_vstelm_w(vec4, dst, 0, 2);
1748        __lsx_vstelm_h(vec2, dst, 4, 6);
1749        dst += stride;
1750        __lsx_vstelm_w(vec4, dst, 0, 3);
1751        __lsx_vstelm_h(vec2, dst, 4, 7);
1752        dst += stride;
1753        __lsx_vstelm_w(vec6, dst, 0, 0);
1754        __lsx_vstelm_h(vec5, dst, 4, 0);
1755        dst += stride;
1756        __lsx_vstelm_w(vec6, dst, 0, 1);
1757        __lsx_vstelm_h(vec5, dst, 4, 1);
1758        dst += stride;
1759        __lsx_vstelm_w(vec6, dst, 0, 2);
1760        __lsx_vstelm_h(vec5, dst, 4, 2);
1761        dst += stride;
1762        __lsx_vstelm_w(vec6, dst, 0, 3);
1763        __lsx_vstelm_h(vec5, dst, 4, 3);
1764        dst += stride;
1765        __lsx_vstelm_w(vec7, dst, 0, 0);
1766        __lsx_vstelm_h(vec5, dst, 4, 4);
1767        dst += stride;
1768        __lsx_vstelm_w(vec7, dst, 0, 1);
1769        __lsx_vstelm_h(vec5, dst, 4, 5);
1770        dst += stride;
1771        __lsx_vstelm_w(vec7, dst, 0, 2);
1772        __lsx_vstelm_h(vec5, dst, 4, 6);
1773        dst += stride;
1774        __lsx_vstelm_w(vec7, dst, 0, 3);
1775        __lsx_vstelm_h(vec5, dst, 4, 7);
1776    }
1777}
1778
1779void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
1780                                int32_t b_limit_ptr,
1781                                int32_t limit_ptr,
1782                                int32_t thresh_ptr)
1783{
1784    ptrdiff_t stride2 = stride << 1;
1785    ptrdiff_t stride3 = stride2 + stride;
1786    ptrdiff_t stride4 = stride2 << 1;
1787    uint8_t *dst_tmp = dst - 4;
1788    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1789    __m128i p1_out, p0_out, q0_out, q1_out;
1790    __m128i flat, mask, hev, thresh, b_limit, limit;
1791    __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1792    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1793    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1794    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1795    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1796    __m128i zero = __lsx_vldi(0);
1797
1798    p0 = __lsx_vld(dst_tmp, 0);
1799    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1800    p3 = __lsx_vldx(dst_tmp, stride3);
1801    dst_tmp += stride4;
1802    row4 = __lsx_vld(dst_tmp, 0);
1803    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1804    row7 = __lsx_vldx(dst_tmp, stride3);
1805    dst_tmp += stride4;
1806    q3 = __lsx_vld(dst_tmp, 0);
1807    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1808    q0 = __lsx_vldx(dst_tmp, stride3);
1809    dst_tmp += stride4;
1810    row12 = __lsx_vld(dst_tmp, 0);
1811    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1812    row15 = __lsx_vldx(dst_tmp, stride3);
1813
1814    /* transpose 16x8 matrix into 8x16 */
1815    LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1816                        q3, q2, q1, q0, row12, row13, row14, row15,
1817                        p3, p2, p1, p0, q0, q1, q2, q3);
1818
1819    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1820    vec0   = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1821    thresh = __lsx_vilvl_d(vec0, thresh);
1822
1823    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1824    vec0    = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1825    b_limit = __lsx_vilvl_d(vec0, b_limit);
1826
1827    limit = __lsx_vreplgr2vr_b(limit_ptr);
1828    vec0  = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1829    limit = __lsx_vilvl_d(vec0, limit);
1830
1831    /* mask and hev */
1832    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1833                 hev, mask, flat);
1834    /* flat4 */
1835    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1836    /* filter4 */
1837    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1838                       q1_out);
1839
1840    flat = __lsx_vilvl_d(zero, flat);
1841
1842    /* if flat is zero for all pixels, then no need to calculate other filter */
1843    if (__lsx_bz_v(flat)) {
1844        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1845        vec2 = __lsx_vilvl_h(vec1, vec0);
1846        vec3 = __lsx_vilvh_h(vec1, vec0);
1847        DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1848        vec4 = __lsx_vilvl_h(vec1, vec0);
1849        vec5 = __lsx_vilvh_h(vec1, vec0);
1850
1851        dst -= 2;
1852        __lsx_vstelm_w(vec2, dst, 0, 0);
1853        __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1854        __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1855        __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1856        dst += stride4;
1857        __lsx_vstelm_w(vec3, dst, 0, 0);
1858        __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1859        __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1860        __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1861        dst += stride4;
1862        __lsx_vstelm_w(vec4, dst, 0, 0);
1863        __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1864        __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1865        __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1866        dst += stride4;
1867        __lsx_vstelm_w(vec5, dst, 0, 0);
1868        __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1869        __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1870        __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1871    } else {
1872        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1873                  p3_l, p2_l, p1_l, p0_l);
1874        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1875                  q0_l, q1_l, q2_l, q3_l);
1876        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1877                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1878
1879        /* convert 16 bit output data into 8 bit */
1880        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1881                  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
1882                  p1_filt8_l, p0_filt8_l, q0_filt8_l);
1883        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1884                  q1_filt8_l, q2_filt8_l);
1885
1886        /* store pixel values */
1887        p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1888        p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1889        p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1890        q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1891        q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1892        q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1893
1894        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1895        vec3 = __lsx_vilvl_h(vec1, vec0);
1896        vec4 = __lsx_vilvh_h(vec1, vec0);
1897        DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1898        vec6 = __lsx_vilvl_h(vec1, vec0);
1899        vec7 = __lsx_vilvh_h(vec1, vec0);
1900        vec2 = __lsx_vilvl_b(q2, q1);
1901        vec5 = __lsx_vilvh_b(q2, q1);
1902
1903        dst -= 3;
1904        __lsx_vstelm_w(vec3, dst, 0, 0);
1905        __lsx_vstelm_h(vec2, dst, 4, 0);
1906        dst += stride;
1907        __lsx_vstelm_w(vec3, dst, 0, 1);
1908        __lsx_vstelm_h(vec2, dst, 4, 1);
1909        dst += stride;
1910        __lsx_vstelm_w(vec3, dst, 0, 2);
1911        __lsx_vstelm_h(vec2, dst, 4, 2);
1912        dst += stride;
1913        __lsx_vstelm_w(vec3, dst, 0, 3);
1914        __lsx_vstelm_h(vec2, dst, 4, 3);
1915        dst += stride;
1916        __lsx_vstelm_w(vec4, dst, 0, 0);
1917        __lsx_vstelm_h(vec2, dst, 4, 4);
1918        dst += stride;
1919        __lsx_vstelm_w(vec4, dst, 0, 1);
1920        __lsx_vstelm_h(vec2, dst, 4, 5);
1921        dst += stride;
1922        __lsx_vstelm_w(vec4, dst, 0, 2);
1923        __lsx_vstelm_h(vec2, dst, 4, 6);
1924        dst += stride;
1925        __lsx_vstelm_w(vec4, dst, 0, 3);
1926        __lsx_vstelm_h(vec2, dst, 4, 7);
1927        dst += stride;
1928        __lsx_vstelm_w(vec6, dst, 0, 0);
1929        __lsx_vstelm_h(vec5, dst, 4, 0);
1930        dst += stride;
1931        __lsx_vstelm_w(vec6, dst, 0, 1);
1932        __lsx_vstelm_h(vec5, dst, 4, 1);
1933        dst += stride;
1934        __lsx_vstelm_w(vec6, dst, 0, 2);
1935        __lsx_vstelm_h(vec5, dst, 4, 2);
1936        dst += stride;
1937        __lsx_vstelm_w(vec6, dst, 0, 3);
1938        __lsx_vstelm_h(vec5, dst, 4, 3);
1939        dst += stride;
1940        __lsx_vstelm_w(vec7, dst, 0, 0);
1941        __lsx_vstelm_h(vec5, dst, 4, 4);
1942        dst += stride;
1943        __lsx_vstelm_w(vec7, dst, 0, 1);
1944        __lsx_vstelm_h(vec5, dst, 4, 5);
1945        dst += stride;
1946        __lsx_vstelm_w(vec7, dst, 0, 2);
1947        __lsx_vstelm_h(vec5, dst, 4, 6);
1948        dst += stride;
1949        __lsx_vstelm_w(vec7, dst, 0, 3);
1950        __lsx_vstelm_h(vec5, dst, 4, 7);
1951    }
1952}
1953
1954void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
1955                                int32_t b_limit_ptr,
1956                                int32_t limit_ptr,
1957                                int32_t thresh_ptr)
1958{
1959    ptrdiff_t stride2 = stride << 1;
1960    ptrdiff_t stride3 = stride2 + stride;
1961    ptrdiff_t stride4 = stride2 << 1;
1962    uint8_t *dst_tmp = dst - 4;
1963    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1964    __m128i p1_out, p0_out, q0_out, q1_out;
1965    __m128i flat, mask, hev, thresh, b_limit, limit;
1966    __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1967    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1968    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1969    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1970    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1971    __m128i zero = __lsx_vldi(0);
1972
1973    p0 = __lsx_vld(dst_tmp, 0);
1974    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1975    p3 = __lsx_vldx(dst_tmp, stride3);
1976    dst_tmp += stride4;
1977    row4 = __lsx_vld(dst_tmp, 0);
1978    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1979    row7 = __lsx_vldx(dst_tmp, stride3);
1980    dst_tmp += stride4;
1981    q3 = __lsx_vld(dst_tmp, 0);
1982    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1983    q0 = __lsx_vldx(dst_tmp, stride3);
1984    dst_tmp += stride4;
1985    row12 = __lsx_vld(dst_tmp, 0);
1986    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1987    row15 = __lsx_vldx(dst_tmp, stride3);
1988
1989    /* transpose 16x8 matrix into 8x16 */
1990    LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1991                        q3, q2, q1, q0, row12, row13, row14, row15,
1992                        p3, p2, p1, p0, q0, q1, q2, q3);
1993
1994    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1995    vec0   = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1996    thresh = __lsx_vilvl_d(vec0, thresh);
1997
1998    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1999    vec0    = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
2000    b_limit = __lsx_vilvl_d(vec0, b_limit);
2001
2002    limit = __lsx_vreplgr2vr_b(limit_ptr);
2003    vec0  = __lsx_vreplgr2vr_b(limit_ptr >> 8);
2004    limit = __lsx_vilvl_d(vec0, limit);
2005
2006    /* mask and hev */
2007    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2008                 hev, mask, flat);
2009    /* flat4 */
2010    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2011    /* filter4 */
2012    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2013                       q1_out);
2014
2015    flat = __lsx_vilvh_d(flat, zero);
2016
2017    /* if flat is zero for all pixels, then no need to calculate other filter */
2018    if (__lsx_bz_v(flat)) {
2019        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2020        vec2 = __lsx_vilvl_h(vec1, vec0);
2021        vec3 = __lsx_vilvh_h(vec1, vec0);
2022        DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2023        vec4 = __lsx_vilvl_h(vec1, vec0);
2024        vec5 = __lsx_vilvh_h(vec1, vec0);
2025
2026        dst -= 2;
2027        __lsx_vstelm_w(vec2, dst, 0, 0);
2028        __lsx_vstelm_w(vec2, dst + stride, 0, 1);
2029        __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
2030        __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
2031        dst += stride4;
2032        __lsx_vstelm_w(vec3, dst, 0, 0);
2033        __lsx_vstelm_w(vec3, dst + stride, 0, 1);
2034        __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
2035        __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
2036        dst += stride4;
2037        __lsx_vstelm_w(vec4, dst, 0, 0);
2038        __lsx_vstelm_w(vec4, dst + stride, 0, 1);
2039        __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
2040        __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
2041        dst += stride4;
2042        __lsx_vstelm_w(vec5, dst, 0, 0);
2043        __lsx_vstelm_w(vec5, dst + stride, 0, 1);
2044        __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
2045        __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
2046    } else {
2047        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2048                  p3_h, p2_h, p1_h, p0_h);
2049        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2050                  q0_h, q1_h, q2_h, q3_h);
2051
2052        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2053                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2054
2055        /* convert 16 bit output data into 8 bit */
2056        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
2057                  p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
2058                  p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
2059        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
2060                  q2_filt8_h, q1_filt8_h, q2_filt8_h);
2061
2062        /* store pixel values */
2063        p2 = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
2064        p1 = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
2065        p0 = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
2066        q0 = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
2067        q1 = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
2068        q2 = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
2069
2070        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2071        vec3 = __lsx_vilvl_h(vec1, vec0);
2072        vec4 = __lsx_vilvh_h(vec1, vec0);
2073        DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2074        vec6 = __lsx_vilvl_h(vec1, vec0);
2075        vec7 = __lsx_vilvh_h(vec1, vec0);
2076        vec2 = __lsx_vilvl_b(q2, q1);
2077        vec5 = __lsx_vilvh_b(q2, q1);
2078
2079        dst -= 3;
2080        __lsx_vstelm_w(vec3, dst, 0, 0);
2081        __lsx_vstelm_h(vec2, dst, 4, 0);
2082        dst += stride;
2083        __lsx_vstelm_w(vec3, dst, 0, 1);
2084        __lsx_vstelm_h(vec2, dst, 4, 1);
2085        dst += stride;
2086        __lsx_vstelm_w(vec3, dst, 0, 2);
2087        __lsx_vstelm_h(vec2, dst, 4, 2);
2088        dst += stride;
2089        __lsx_vstelm_w(vec3, dst, 0, 3);
2090        __lsx_vstelm_h(vec2, dst, 4, 3);
2091        dst += stride;
2092        __lsx_vstelm_w(vec4, dst, 0, 0);
2093        __lsx_vstelm_h(vec2, dst, 4, 4);
2094        dst += stride;
2095        __lsx_vstelm_w(vec4, dst, 0, 1);
2096        __lsx_vstelm_h(vec2, dst, 4, 5);
2097        dst += stride;
2098        __lsx_vstelm_w(vec4, dst, 0, 2);
2099        __lsx_vstelm_h(vec2, dst, 4, 6);
2100        dst += stride;
2101        __lsx_vstelm_w(vec4, dst, 0, 3);
2102        __lsx_vstelm_h(vec2, dst, 4, 7);
2103        dst += stride;
2104        __lsx_vstelm_w(vec6, dst, 0, 0);
2105        __lsx_vstelm_h(vec5, dst, 4, 0);
2106        dst += stride;
2107        __lsx_vstelm_w(vec6, dst, 0, 1);
2108        __lsx_vstelm_h(vec5, dst, 4, 1);
2109        dst += stride;
2110        __lsx_vstelm_w(vec6, dst, 0, 2);
2111        __lsx_vstelm_h(vec5, dst, 4, 2);
2112        dst += stride;
2113        __lsx_vstelm_w(vec6, dst, 0, 3);
2114        __lsx_vstelm_h(vec5, dst, 4, 3);
2115        dst += stride;
2116        __lsx_vstelm_w(vec7, dst, 0, 0);
2117        __lsx_vstelm_h(vec5, dst, 4, 4);
2118        dst += stride;
2119        __lsx_vstelm_w(vec7, dst, 0, 1);
2120        __lsx_vstelm_h(vec5, dst, 4, 5);
2121        dst += stride;
2122        __lsx_vstelm_w(vec7, dst, 0, 2);
2123        __lsx_vstelm_h(vec5, dst, 4, 6);
2124        dst += stride;
2125        __lsx_vstelm_w(vec7, dst, 0, 3);
2126        __lsx_vstelm_h(vec5, dst, 4, 7);
2127    }
2128}
2129
2130static void vp9_transpose_16x8_to_8x16(uint8_t *input, ptrdiff_t in_pitch,
2131                                       uint8_t *output)
2132{
2133    __m128i p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
2134    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2135    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2136    ptrdiff_t in_pitch2 = in_pitch << 1;
2137    ptrdiff_t in_pitch3 = in_pitch2 + in_pitch;
2138    ptrdiff_t in_pitch4 = in_pitch2 << 1;
2139
2140    LSX_LD_8(input, in_pitch, in_pitch2, in_pitch3, in_pitch4,
2141             p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
2142    /* 8x8 transpose */
2143    LSX_TRANSPOSE8x8_B(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
2144                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
2145    /* 8x8 transpose */
2146    DUP4_ARG2(__lsx_vilvh_b, p5_org, p7_org, p4_org, p6_org, p1_org,
2147              p3_org, p0_org, p2_org, tmp0, tmp1, tmp2, tmp3);
2148    DUP2_ARG2(__lsx_vilvl_b, tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
2149    DUP2_ARG2(__lsx_vilvh_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
2150    DUP2_ARG2(__lsx_vilvl_w, tmp6, tmp4, tmp7, tmp5, q0, q4);
2151    DUP2_ARG2(__lsx_vilvh_w, tmp6, tmp4, tmp7, tmp5, q2, q6);
2152    DUP4_ARG2(__lsx_vbsrl_v, q0, 8, q2, 8, q4, 8, q6, 8, q1, q3, q5, q7);
2153
2154    __lsx_vst(p7, output, 0);
2155    __lsx_vst(p6, output, 16);
2156    __lsx_vst(p5, output, 32);
2157    __lsx_vst(p4, output, 48);
2158    __lsx_vst(p3, output, 64);
2159    __lsx_vst(p2, output, 80);
2160    __lsx_vst(p1, output, 96);
2161    __lsx_vst(p0, output, 112);
2162    __lsx_vst(q0, output, 128);
2163    __lsx_vst(q1, output, 144);
2164    __lsx_vst(q2, output, 160);
2165    __lsx_vst(q3, output, 176);
2166    __lsx_vst(q4, output, 192);
2167    __lsx_vst(q5, output, 208);
2168    __lsx_vst(q6, output, 224);
2169    __lsx_vst(q7, output, 240);
2170}
2171
2172static void vp9_transpose_8x16_to_16x8(uint8_t *input, uint8_t *output,
2173                                       ptrdiff_t out_pitch)
2174{
2175    __m128i p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
2176    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2177    ptrdiff_t out_pitch2 = out_pitch << 1;
2178    ptrdiff_t out_pitch3 = out_pitch2 + out_pitch;
2179    ptrdiff_t out_pitch4 = out_pitch2 << 1;
2180
2181    DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
2182              p7, p6, p5, p4);
2183    DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
2184              p3, p2, p1, p0);
2185    DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176,
2186              q0, q1, q2, q3);
2187    DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240,
2188              q4, q5, q6, q7);
2189    LSX_TRANSPOSE16x8_B(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
2190                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
2191    LSX_ST_8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o,
2192             output, out_pitch, out_pitch2, out_pitch3, out_pitch4);
2193}
2194
2195static void vp9_transpose_16x16(uint8_t *input, int32_t in_stride,
2196                                uint8_t *output, int32_t out_stride)
2197{
2198    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
2199    __m128i row8, row9, row10, row11, row12, row13, row14, row15;
2200    __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
2201    __m128i tmp2, tmp3;
2202    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2203    int32_t in_stride2 = in_stride << 1;
2204    int32_t in_stride3 = in_stride2 + in_stride;
2205    int32_t in_stride4 = in_stride2 << 1;
2206    int32_t out_stride2 = out_stride << 1;
2207    int32_t out_stride3 = out_stride2 + out_stride;
2208    int32_t out_stride4 = out_stride2 << 1;
2209
2210    LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2211             row0, row1, row2, row3, row4, row5, row6, row7);
2212    input += in_stride4;
2213    LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2214             row8, row9, row10, row11, row12, row13, row14, row15);
2215
2216    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
2217                        row8, row9, row10, row11, row12, row13, row14, row15,
2218                        p7, p6, p5, p4, p3, p2, p1, p0);
2219
2220    /* transpose 16x8 matrix into 8x16 */
2221    /* total 8 intermediate register and 32 instructions */
2222    q7 = __lsx_vpackod_d(row8, row0);
2223    q6 = __lsx_vpackod_d(row9, row1);
2224    q5 = __lsx_vpackod_d(row10, row2);
2225    q4 = __lsx_vpackod_d(row11, row3);
2226    q3 = __lsx_vpackod_d(row12, row4);
2227    q2 = __lsx_vpackod_d(row13, row5);
2228    q1 = __lsx_vpackod_d(row14, row6);
2229    q0 = __lsx_vpackod_d(row15, row7);
2230
2231    DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
2232    DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
2233
2234    DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
2235    DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
2236
2237    DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
2238    q0 = __lsx_vpackev_w(tmp3, tmp2);
2239    q4 = __lsx_vpackod_w(tmp3, tmp2);
2240
2241    tmp2 = __lsx_vpackod_h(tmp1, tmp0);
2242    tmp3 = __lsx_vpackod_h(q7, q5);
2243    q2 = __lsx_vpackev_w(tmp3, tmp2);
2244    q6 = __lsx_vpackod_w(tmp3, tmp2);
2245
2246    DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
2247    q1 = __lsx_vpackev_w(tmp3, tmp2);
2248    q5 = __lsx_vpackod_w(tmp3, tmp2);
2249
2250    tmp2 = __lsx_vpackod_h(tmp5, tmp4);
2251    tmp3 = __lsx_vpackod_h(tmp7, tmp6);
2252    q3 = __lsx_vpackev_w(tmp3, tmp2);
2253    q7 = __lsx_vpackod_w(tmp3, tmp2);
2254
2255    LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride,
2256             out_stride2, out_stride3, out_stride4);
2257    output += out_stride4;
2258    LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride,
2259             out_stride2, out_stride3, out_stride4);
2260}
2261
2262static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
2263                                       uint8_t *src_org, int32_t pitch_org,
2264                                       int32_t b_limit_ptr,
2265                                       int32_t limit_ptr,
2266                                       int32_t thresh_ptr)
2267{
2268    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2269    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2270    __m128i flat, mask, hev, thresh, b_limit, limit;
2271    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2272    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2273    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2274    __m128i vec0, vec1, vec2, vec3;
2275    __m128i zero = __lsx_vldi(0);
2276
2277    /* load vector elements */
2278    DUP4_ARG2(__lsx_vld, src, -64, src, -48, src, -32, src, -16,
2279              p3, p2, p1, p0);
2280    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, q0, q1, q2, q3);
2281
2282    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
2283    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2284    limit   = __lsx_vreplgr2vr_b(limit_ptr);
2285
2286    /* mask and hev */
2287    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2288                 hev, mask, flat);
2289    /* flat4 */
2290    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2291    /* filter4 */
2292    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2293                       q1_out);
2294
2295    flat = __lsx_vilvl_d(zero, flat);
2296
2297    /* if flat is zero for all pixels, then no need to calculate other filter */
2298    if (__lsx_bz_v(flat)) {
2299        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2300        vec2 = __lsx_vilvl_h(vec1, vec0);
2301        vec3 = __lsx_vilvh_h(vec1, vec0);
2302
2303        src_org -= 2;
2304        __lsx_vstelm_w(vec2, src_org, 0, 0);
2305        src_org += pitch_org;
2306        __lsx_vstelm_w(vec2, src_org, 0, 1);
2307        src_org += pitch_org;
2308        __lsx_vstelm_w(vec2, src_org, 0, 2);
2309        src_org += pitch_org;
2310        __lsx_vstelm_w(vec2, src_org, 0, 3);
2311        src_org += pitch_org;
2312        __lsx_vstelm_w(vec3, src_org, 0, 0);
2313        src_org += pitch_org;
2314        __lsx_vstelm_w(vec3, src_org, 0, 1);
2315        src_org += pitch_org;
2316        __lsx_vstelm_w(vec3, src_org, 0, 2);
2317        src_org += pitch_org;
2318        __lsx_vstelm_w(vec3, src_org, 0, 3);
2319        return 1;
2320    } else {
2321        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2322                  p3_l, p2_l, p1_l, p0_l);
2323        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2324                  q0_l, q1_l, q2_l, q3_l);
2325        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2326                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2327
2328        /* convert 16 bit output data into 8 bit */
2329        p2_l = __lsx_vpickev_b(p2_filt8_l, p2_filt8_l);
2330        p1_l = __lsx_vpickev_b(p1_filt8_l, p1_filt8_l);
2331        p0_l = __lsx_vpickev_b(p0_filt8_l, p0_filt8_l);
2332        q0_l = __lsx_vpickev_b(q0_filt8_l, q0_filt8_l);
2333        q1_l = __lsx_vpickev_b(q1_filt8_l, q1_filt8_l);
2334        q2_l = __lsx_vpickev_b(q2_filt8_l, q2_filt8_l);
2335
2336        /* store pixel values */
2337        p2_out = __lsx_vbitsel_v(p2, p2_l, flat);
2338        p1_out = __lsx_vbitsel_v(p1_out, p1_l, flat);
2339        p0_out = __lsx_vbitsel_v(p0_out, p0_l, flat);
2340        q0_out = __lsx_vbitsel_v(q0_out, q0_l, flat);
2341        q1_out = __lsx_vbitsel_v(q1_out, q1_l, flat);
2342        q2_out = __lsx_vbitsel_v(q2, q2_l, flat);
2343
2344        __lsx_vst(p2_out, filter48, 0);
2345        __lsx_vst(p1_out, filter48, 16);
2346        __lsx_vst(p0_out, filter48, 32);
2347        __lsx_vst(q0_out, filter48, 48);
2348        __lsx_vst(q1_out, filter48, 64);
2349        __lsx_vst(q2_out, filter48, 80);
2350        __lsx_vst(flat, filter48, 96);
2351
2352        return 0;
2353    }
2354}
2355
2356static int32_t vp9_vt_lpf_t16_8w(uint8_t *dst, uint8_t *dst_org,
2357                                 ptrdiff_t stride,
2358                                 uint8_t *filter48)
2359{
2360    __m128i zero = __lsx_vldi(0);
2361    __m128i filter8, flat, flat2;
2362    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2363    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2364    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2365    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2366    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2367    v8u16 tmp0_l, tmp1_l;
2368    __m128i out_l;
2369    uint8_t *dst_tmp = dst - 128;
2370
2371    /* load vector elements */
2372    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2373              dst_tmp, 48, p7, p6, p5, p4);
2374    DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2375              dst_tmp, 112, p3, p2, p1, p0);
2376    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2377    DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2378
2379    flat = __lsx_vld(filter48, 96);
2380
2381
2382    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2383
2384    /* if flat2 is zero for all pixels, then no need to calculate other filter */
2385    if (__lsx_bz_v(flat2)) {
2386        __m128i vec0, vec1, vec2, vec3, vec4;
2387
2388        DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2389                  filter48, 48, p2, p1, p0, q0);
2390        DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2391
2392        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2393        vec3 = __lsx_vilvl_h(vec1, vec0);
2394        vec4 = __lsx_vilvh_h(vec1, vec0);
2395        vec2 = __lsx_vilvl_b(q2, q1);
2396
2397        dst_org -= 3;
2398        __lsx_vstelm_w(vec3, dst_org, 0, 0);
2399        __lsx_vstelm_h(vec2, dst_org, 4, 0);
2400        dst_org += stride;
2401        __lsx_vstelm_w(vec3, dst_org, 0, 1);
2402        __lsx_vstelm_h(vec2, dst_org, 4, 1);
2403        dst_org += stride;
2404        __lsx_vstelm_w(vec3, dst_org, 0, 2);
2405        __lsx_vstelm_h(vec2, dst_org, 4, 2);
2406        dst_org += stride;
2407        __lsx_vstelm_w(vec3, dst_org, 0, 3);
2408        __lsx_vstelm_h(vec2, dst_org, 4, 3);
2409        dst_org += stride;
2410        __lsx_vstelm_w(vec4, dst_org, 0, 0);
2411        __lsx_vstelm_h(vec2, dst_org, 4, 4);
2412        dst_org += stride;
2413        __lsx_vstelm_w(vec4, dst_org, 0, 1);
2414        __lsx_vstelm_h(vec2, dst_org, 4, 5);
2415        dst_org += stride;
2416        __lsx_vstelm_w(vec4, dst_org, 0, 2);
2417        __lsx_vstelm_h(vec2, dst_org, 4, 6);
2418        dst_org += stride;
2419        __lsx_vstelm_w(vec4, dst_org, 0, 3);
2420        __lsx_vstelm_h(vec2, dst_org, 4, 7);
2421        return 1;
2422    } else {
2423        dst -= 7 * 16;
2424
2425        p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2426        p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2427        p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2428        p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2429        p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2430        p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2431        p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2432        p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2433        q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2434
2435        tmp0_l = p7_l_in << 3;
2436        tmp0_l -= p7_l_in;
2437        tmp0_l += p6_l_in;
2438        tmp0_l += q0_l_in;
2439        tmp1_l = p6_l_in + p5_l_in;
2440        tmp1_l += p4_l_in;
2441        tmp1_l += p3_l_in;
2442        tmp1_l += p2_l_in;
2443        tmp1_l += p1_l_in;
2444        tmp1_l += p0_l_in;
2445        tmp1_l += tmp0_l;
2446
2447        out_l =__lsx_vsrari_h((__m128i)tmp1_l, 4);
2448        out_l =__lsx_vpickev_b(out_l, out_l);
2449        p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2450        __lsx_vstelm_d(p6, dst, 0, 0);
2451        dst += 16;
2452
2453        /* p5 */
2454        q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2455        tmp0_l = p5_l_in - p6_l_in;
2456        tmp0_l += q1_l_in;
2457        tmp0_l -= p7_l_in;
2458        tmp1_l += tmp0_l;
2459        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2460        out_l = __lsx_vpickev_b(out_l, out_l);
2461        p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2462        __lsx_vstelm_d(p5, dst, 0, 0);
2463        dst += 16;
2464
2465        /* p4 */
2466        q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2467        tmp0_l = p4_l_in - p5_l_in;
2468        tmp0_l += q2_l_in;
2469        tmp0_l -= p7_l_in;
2470        tmp1_l += tmp0_l;
2471        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2472        out_l = __lsx_vpickev_b(out_l, out_l);
2473        p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2474        __lsx_vstelm_d(p4, dst, 0, 0);
2475        dst += 16;
2476
2477        /* p3 */
2478        q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2479        tmp0_l = p3_l_in - p4_l_in;
2480        tmp0_l += q3_l_in;
2481        tmp0_l -= p7_l_in;
2482        tmp1_l += tmp0_l;
2483        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2484        out_l = __lsx_vpickev_b(out_l, out_l);
2485        p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2486        __lsx_vstelm_d(p3, dst, 0, 0);
2487        dst += 16;
2488
2489        /* p2 */
2490        q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2491        filter8 = __lsx_vld(filter48, 0);
2492        tmp0_l = p2_l_in - p3_l_in;
2493        tmp0_l += q4_l_in;
2494        tmp0_l -= p7_l_in;
2495        tmp1_l += tmp0_l;
2496        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2497        out_l = __lsx_vpickev_b(out_l, out_l);
2498        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2499        __lsx_vstelm_d(filter8, dst, 0, 0);
2500        dst += 16;
2501
2502        /* p1 */
2503        q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2504        filter8 = __lsx_vld(filter48, 16);
2505        tmp0_l = p1_l_in - p2_l_in;
2506        tmp0_l += q5_l_in;
2507        tmp0_l -= p7_l_in;
2508        tmp1_l += tmp0_l;
2509        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2510        out_l = __lsx_vpickev_b(out_l, out_l);
2511        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2512        __lsx_vstelm_d(filter8, dst, 0, 0);
2513        dst += 16;
2514
2515        /* p0 */
2516        q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2517        filter8 = __lsx_vld(filter48, 32);
2518        tmp0_l = p0_l_in - p1_l_in;
2519        tmp0_l += q6_l_in;
2520        tmp0_l -= p7_l_in;
2521        tmp1_l += tmp0_l;
2522        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2523        out_l = __lsx_vpickev_b(out_l, out_l);
2524        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2525        __lsx_vstelm_d(filter8, dst, 0, 0);
2526        dst += 16;
2527
2528        /* q0 */
2529        q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
2530        filter8 = __lsx_vld(filter48, 48);
2531        tmp0_l = q7_l_in - p0_l_in;
2532        tmp0_l += q0_l_in;
2533        tmp0_l -= p7_l_in;
2534        tmp1_l += tmp0_l;
2535        out_l = __lsx_vsrari_h((v8i16) tmp1_l, 4);
2536        out_l = __lsx_vpickev_b(out_l, out_l);
2537        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2538        __lsx_vstelm_d(filter8, dst, 0, 0);
2539        dst += 16;
2540
2541        /* q1 */
2542        filter8 = __lsx_vld(filter48, 64);
2543        tmp0_l = q7_l_in - q0_l_in;
2544        tmp0_l += q1_l_in;
2545        tmp0_l -= p6_l_in;
2546        tmp1_l += tmp0_l;
2547        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2548        out_l = __lsx_vpickev_b(out_l, out_l);
2549        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2550        __lsx_vstelm_d(filter8, dst, 0, 0);
2551        dst += 16;
2552
2553        /* q2 */
2554        filter8 = __lsx_vld(filter48, 80);
2555        tmp0_l = q7_l_in - q1_l_in;
2556        tmp0_l += q2_l_in;
2557        tmp0_l -= p5_l_in;
2558        tmp1_l += tmp0_l;
2559        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2560        out_l = __lsx_vpickev_b(out_l, out_l);
2561        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2562        __lsx_vstelm_d(filter8, dst, 0, 0);
2563        dst += 16;
2564
2565        /* q3 */
2566        tmp0_l = q7_l_in - q2_l_in;
2567        tmp0_l += q3_l_in;
2568        tmp0_l -= p4_l_in;
2569        tmp1_l += tmp0_l;
2570        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2571        out_l = __lsx_vpickev_b(out_l, out_l);
2572        q3 = __lsx_vbitsel_v(q3, out_l, flat2);
2573        __lsx_vstelm_d(q3, dst, 0, 0);
2574        dst += 16;
2575
2576        /* q4 */
2577        tmp0_l = q7_l_in - q3_l_in;
2578        tmp0_l += q4_l_in;
2579        tmp0_l -= p3_l_in;
2580        tmp1_l += tmp0_l;
2581        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2582        out_l = __lsx_vpickev_b(out_l, out_l);
2583        q4 = __lsx_vbitsel_v(q4, out_l, flat2);
2584        __lsx_vstelm_d(q4, dst, 0, 0);
2585        dst += 16;
2586
2587        /* q5 */
2588        tmp0_l = q7_l_in - q4_l_in;
2589        tmp0_l += q5_l_in;
2590        tmp0_l -= p2_l_in;
2591        tmp1_l += tmp0_l;
2592        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2593        out_l = __lsx_vpickev_b(out_l, out_l);
2594        q5 = __lsx_vbitsel_v(q5, out_l, flat2);
2595        __lsx_vstelm_d(q5, dst, 0, 0);
2596        dst += 16;
2597
2598        /* q6 */
2599        tmp0_l = q7_l_in - q5_l_in;
2600        tmp0_l += q6_l_in;
2601        tmp0_l -= p1_l_in;
2602        tmp1_l += tmp0_l;
2603        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2604        out_l = __lsx_vpickev_b(out_l, out_l);
2605        q6 = __lsx_vbitsel_v(q6, out_l, flat2);
2606        __lsx_vstelm_d(q6, dst, 0, 0);
2607
2608        return 0;
2609    }
2610}
2611
2612void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
2613                               int32_t b_limit_ptr,
2614                               int32_t limit_ptr,
2615                               int32_t thresh_ptr)
2616{
2617    uint8_t early_exit = 0;
2618    uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
2619    uint8_t *filter48 = &transposed_input[16 * 16];
2620
2621    vp9_transpose_16x8_to_8x16(dst - 8, stride, transposed_input);
2622
2623    early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2624                                         &filter48[0], dst, stride,
2625                                         b_limit_ptr, limit_ptr, thresh_ptr);
2626
2627    if (0 == early_exit) {
2628        early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), dst, stride,
2629                                       &filter48[0]);
2630
2631        if (0 == early_exit) {
2632            vp9_transpose_8x16_to_16x8(transposed_input, dst - 8, stride);
2633        }
2634    }
2635}
2636
2637static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
2638                                        uint8_t *dst_org, ptrdiff_t stride,
2639                                        int32_t b_limit_ptr,
2640                                        int32_t limit_ptr,
2641                                        int32_t thresh_ptr)
2642{
2643    ptrdiff_t stride2 = stride << 1;
2644    ptrdiff_t stride3 = stride2 + stride;
2645    ptrdiff_t stride4 = stride2 << 1;
2646    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2647    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2648    __m128i flat, mask, hev, thresh, b_limit, limit;
2649    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2650    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
2651    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2652    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2653    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
2654    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
2655    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
2656    __m128i zero = __lsx_vldi(0);
2657
2658    /* load vector elements */
2659    DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16,
2660              p3, p2, p1, p0);
2661    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2662
2663    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
2664    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2665    limit   = __lsx_vreplgr2vr_b(limit_ptr);
2666
2667    /* mask and hev */
2668    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2669                 hev, mask, flat);
2670    /* flat4 */
2671    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2672    /* filter4 */
2673    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2674                       q1_out);
2675
2676    /* if flat is zero for all pixels, then no need to calculate other filter */
2677    if (__lsx_bz_v(flat)) {
2678        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2679        vec2 = __lsx_vilvl_h(vec1, vec0);
2680        vec3 = __lsx_vilvh_h(vec1, vec0);
2681        DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2682        vec4 = __lsx_vilvl_h(vec1, vec0);
2683        vec5 = __lsx_vilvh_h(vec1, vec0);
2684
2685        dst_org -= 2;
2686        __lsx_vstelm_w(vec2, dst_org, 0, 0);
2687        __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
2688        __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
2689        __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
2690        dst_org += stride4;
2691        __lsx_vstelm_w(vec3, dst_org, 0, 0);
2692        __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
2693        __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
2694        __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
2695        dst_org += stride4;
2696        __lsx_vstelm_w(vec4, dst_org, 0, 0);
2697        __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
2698        __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
2699        __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
2700        dst_org += stride4;
2701        __lsx_vstelm_w(vec5, dst_org, 0, 0);
2702        __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
2703        __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
2704        __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
2705
2706        return 1;
2707    } else {
2708        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2709                  p3_l, p2_l, p1_l, p0_l);
2710        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2711                  q0_l, q1_l, q2_l, q3_l);
2712        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2713                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2714        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2715                  p3_h, p2_h, p1_h, p0_h);
2716        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2717                  q0_h, q1_h, q2_h, q3_h);
2718        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2719                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2720
2721        /* convert 16 bit output data into 8 bit */
2722        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
2723                      p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h,
2724                      q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
2725                      q0_filt8_l);
2726        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
2727                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
2728
2729        /* store pixel values */
2730        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
2731        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
2732        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
2733        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
2734        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
2735        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
2736
2737        __lsx_vst(p2_out, filter48, 0);
2738        __lsx_vst(p1_out, filter48, 16);
2739        __lsx_vst(p0_out, filter48, 32);
2740        __lsx_vst(q0_out, filter48, 48);
2741        __lsx_vst(q1_out, filter48, 64);
2742        __lsx_vst(q2_out, filter48, 80);
2743        __lsx_vst(flat, filter48, 96);
2744
2745        return 0;
2746    }
2747}
2748
2749static int32_t vp9_vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org,
2750                                  ptrdiff_t stride,
2751                                  uint8_t *filter48)
2752{
2753    __m128i zero = __lsx_vldi(0);
2754    __m128i flat, flat2, filter8;
2755    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2756    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2757    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2758    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2759    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2760    v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
2761    v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
2762    v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
2763    v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
2764    v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
2765    __m128i out_l, out_h;
2766    uint8_t *dst_tmp = dst - 128;
2767
2768    flat = __lsx_vld(filter48, 96);
2769
2770    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2771              dst_tmp, 48, p7, p6, p5, p4);
2772    DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2773              dst_tmp, 112, p3, p2, p1, p0);
2774    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2775    DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2776
2777    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2778
2779    /* if flat2 is zero for all pixels, then no need to calculate other filter */
2780    if (__lsx_bz_v(flat2)) {
2781        __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2782
2783        DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2784                  filter48, 48, p2, p1, p0, q0);
2785        DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2786
2787        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2788        vec3 = __lsx_vilvl_h(vec1, vec0);
2789        vec4 = __lsx_vilvh_h(vec1, vec0);
2790        DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2791        vec6 = __lsx_vilvl_h(vec1, vec0);
2792        vec7 = __lsx_vilvh_h(vec1, vec0);
2793        vec2 = __lsx_vilvl_b(q2, q1);
2794        vec5 = __lsx_vilvh_b(q2, q1);
2795
2796        dst_org -= 3;
2797        __lsx_vstelm_w(vec3, dst_org, 0, 0);
2798        __lsx_vstelm_h(vec2, dst_org, 4, 0);
2799        dst_org += stride;
2800        __lsx_vstelm_w(vec3, dst_org, 0, 1);
2801        __lsx_vstelm_h(vec2, dst_org, 4, 1);
2802        dst_org += stride;
2803        __lsx_vstelm_w(vec3, dst_org, 0, 2);
2804        __lsx_vstelm_h(vec2, dst_org, 4, 2);
2805        dst_org += stride;
2806        __lsx_vstelm_w(vec3, dst_org, 0, 3);
2807        __lsx_vstelm_h(vec2, dst_org, 4, 3);
2808        dst_org += stride;
2809        __lsx_vstelm_w(vec4, dst_org, 0, 0);
2810        __lsx_vstelm_h(vec2, dst_org, 4, 4);
2811        dst_org += stride;
2812        __lsx_vstelm_w(vec4, dst_org, 0, 1);
2813        __lsx_vstelm_h(vec2, dst_org, 4, 5);
2814        dst_org += stride;
2815        __lsx_vstelm_w(vec4, dst_org, 0, 2);
2816        __lsx_vstelm_h(vec2, dst_org, 4, 6);
2817        dst_org += stride;
2818        __lsx_vstelm_w(vec4, dst_org, 0, 3);
2819        __lsx_vstelm_h(vec2, dst_org, 4, 7);
2820        dst_org += stride;
2821        __lsx_vstelm_w(vec6, dst_org, 0, 0);
2822        __lsx_vstelm_h(vec5, dst_org, 4, 0);
2823        dst_org += stride;
2824        __lsx_vstelm_w(vec6, dst_org, 0, 1);
2825        __lsx_vstelm_h(vec5, dst_org, 4, 1);
2826        dst_org += stride;
2827        __lsx_vstelm_w(vec6, dst_org, 0, 2);
2828        __lsx_vstelm_h(vec5, dst_org, 4, 2);
2829        dst_org += stride;
2830        __lsx_vstelm_w(vec6, dst_org, 0, 3);
2831        __lsx_vstelm_h(vec5, dst_org, 4, 3);
2832        dst_org += stride;
2833        __lsx_vstelm_w(vec7, dst_org, 0, 0);
2834        __lsx_vstelm_h(vec5, dst_org, 4, 4);
2835        dst_org += stride;
2836        __lsx_vstelm_w(vec7, dst_org, 0, 1);
2837        __lsx_vstelm_h(vec5, dst_org, 4, 5);
2838        dst_org += stride;
2839        __lsx_vstelm_w(vec7, dst_org, 0, 2);
2840        __lsx_vstelm_h(vec5, dst_org, 4, 6);
2841        dst_org += stride;
2842        __lsx_vstelm_w(vec7, dst_org, 0, 3);
2843        __lsx_vstelm_h(vec5, dst_org, 4, 7);
2844
2845        return 1;
2846    } else {
2847        dst -= 7 * 16;
2848
2849        p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2850        p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2851        p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2852        p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2853        p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2854        p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2855        p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2856        p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2857        q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2858
2859        tmp0_l = p7_l_in << 3;
2860        tmp0_l -= p7_l_in;
2861        tmp0_l += p6_l_in;
2862        tmp0_l += q0_l_in;
2863        tmp1_l = p6_l_in + p5_l_in;
2864        tmp1_l += p4_l_in;
2865        tmp1_l += p3_l_in;
2866        tmp1_l += p2_l_in;
2867        tmp1_l += p1_l_in;
2868        tmp1_l += p0_l_in;
2869        tmp1_l += tmp0_l;
2870        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2871
2872        p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
2873        p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
2874        p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
2875        p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
2876        p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
2877        p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
2878        p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
2879        p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
2880        q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
2881
2882        tmp0_h = p7_h_in << 3;
2883        tmp0_h -= p7_h_in;
2884        tmp0_h += p6_h_in;
2885        tmp0_h += q0_h_in;
2886        tmp1_h = p6_h_in + p5_h_in;
2887        tmp1_h += p4_h_in;
2888        tmp1_h += p3_h_in;
2889        tmp1_h += p2_h_in;
2890        tmp1_h += p1_h_in;
2891        tmp1_h += p0_h_in;
2892        tmp1_h += tmp0_h;
2893        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2894
2895        out_l = __lsx_vpickev_b(out_h, out_l);
2896        p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2897        __lsx_vst(p6, dst, 0);
2898
2899        /* p5 */
2900        q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2901        tmp0_l = p5_l_in - p6_l_in;
2902        tmp0_l += q1_l_in;
2903        tmp0_l -= p7_l_in;
2904        tmp1_l += tmp0_l;
2905        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2906        q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
2907        tmp0_h = p5_h_in - p6_h_in;
2908        tmp0_h += q1_h_in;
2909        tmp0_h -= p7_h_in;
2910        tmp1_h += tmp0_h;
2911        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2912        out_l = __lsx_vpickev_b(out_h, out_l);
2913        p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2914        __lsx_vst(p5, dst, 16);
2915
2916        /* p4 */
2917        q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2918        tmp0_l = p4_l_in - p5_l_in;
2919        tmp0_l += q2_l_in;
2920        tmp0_l -= p7_l_in;
2921        tmp1_l += tmp0_l;
2922        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2923        q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
2924        tmp0_h = p4_h_in - p5_h_in;
2925        tmp0_h += q2_h_in;
2926        tmp0_h -= p7_h_in;
2927        tmp1_h += tmp0_h;
2928        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2929        out_l = __lsx_vpickev_b(out_h, out_l);
2930        p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2931        __lsx_vst(p4, dst, 16*2);
2932
2933        /* p3 */
2934        q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2935        tmp0_l = p3_l_in - p4_l_in;
2936        tmp0_l += q3_l_in;
2937        tmp0_l -= p7_l_in;
2938        tmp1_l += tmp0_l;
2939        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2940        q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
2941        tmp0_h = p3_h_in - p4_h_in;
2942        tmp0_h += q3_h_in;
2943        tmp0_h -= p7_h_in;
2944        tmp1_h += tmp0_h;
2945        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2946        out_l = __lsx_vpickev_b(out_h, out_l);
2947        p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2948        __lsx_vst(p3, dst, 16*3);
2949
2950        /* p2 */
2951        q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2952        filter8 = __lsx_vld(filter48, 0);
2953        tmp0_l = p2_l_in - p3_l_in;
2954        tmp0_l += q4_l_in;
2955        tmp0_l -= p7_l_in;
2956        tmp1_l += tmp0_l;
2957        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2958        q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
2959        tmp0_h = p2_h_in - p3_h_in;
2960        tmp0_h += q4_h_in;
2961        tmp0_h -= p7_h_in;
2962        tmp1_h += tmp0_h;
2963        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2964        out_l = __lsx_vpickev_b(out_h, out_l);
2965        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2966        __lsx_vst(filter8, dst, 16*4);
2967
2968        /* p1 */
2969        q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2970        filter8 = __lsx_vld(filter48, 16);
2971        tmp0_l = p1_l_in - p2_l_in;
2972        tmp0_l += q5_l_in;
2973        tmp0_l -= p7_l_in;
2974        tmp1_l += tmp0_l;
2975        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2976        q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
2977        tmp0_h = p1_h_in - p2_h_in;
2978        tmp0_h += q5_h_in;
2979        tmp0_h -= p7_h_in;
2980        tmp1_h += tmp0_h;
2981        out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4);
2982        out_l = __lsx_vpickev_b(out_h, out_l);
2983        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2984        __lsx_vst(filter8, dst, 16*5);
2985
2986        /* p0 */
2987        q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2988        filter8 = __lsx_vld(filter48, 32);
2989        tmp0_l = p0_l_in - p1_l_in;
2990        tmp0_l += q6_l_in;
2991        tmp0_l -= p7_l_in;
2992        tmp1_l += tmp0_l;
2993        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2994        q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
2995        tmp0_h = p0_h_in - p1_h_in;
2996        tmp0_h += q6_h_in;
2997        tmp0_h -= p7_h_in;
2998        tmp1_h += tmp0_h;
2999        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3000        out_l = __lsx_vpickev_b(out_h, out_l);
3001        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3002        __lsx_vst(filter8, dst, 16*6);
3003
3004        /* q0 */
3005        q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
3006        filter8 = __lsx_vld(filter48, 48);
3007        tmp0_l = q7_l_in - p0_l_in;
3008        tmp0_l += q0_l_in;
3009        tmp0_l -= p7_l_in;
3010        tmp1_l += tmp0_l;
3011        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3012        q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
3013        tmp0_h = q7_h_in - p0_h_in;
3014        tmp0_h += q0_h_in;
3015        tmp0_h -= p7_h_in;
3016        tmp1_h += tmp0_h;
3017        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3018        out_l = __lsx_vpickev_b(out_h, out_l);
3019        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3020        __lsx_vst(filter8, dst, 16*7);
3021
3022        /* q1 */
3023        filter8 = __lsx_vld(filter48, 64);
3024        tmp0_l = q7_l_in - q0_l_in;
3025        tmp0_l += q1_l_in;
3026        tmp0_l -= p6_l_in;
3027        tmp1_l += tmp0_l;
3028        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3029        tmp0_h = q7_h_in - q0_h_in;
3030        tmp0_h += q1_h_in;
3031        tmp0_h -= p6_h_in;
3032        tmp1_h += tmp0_h;
3033        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3034        out_l = __lsx_vpickev_b(out_h, out_l);
3035        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3036        __lsx_vst(filter8, dst, 16*8);
3037
3038        /* q2 */
3039        filter8 = __lsx_vld(filter48, 80);
3040        tmp0_l = q7_l_in - q1_l_in;
3041        tmp0_l += q2_l_in;
3042        tmp0_l -= p5_l_in;
3043        tmp1_l += tmp0_l;
3044        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3045        tmp0_h = q7_h_in - q1_h_in;
3046        tmp0_h += q2_h_in;
3047        tmp0_h -= p5_h_in;
3048        tmp1_h += tmp0_h;
3049        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3050        out_l = __lsx_vpickev_b(out_h, out_l);
3051        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3052        __lsx_vst(filter8, dst, 16*9);
3053
3054        /* q3 */
3055        tmp0_l = q7_l_in - q2_l_in;
3056        tmp0_l += q3_l_in;
3057        tmp0_l -= p4_l_in;
3058        tmp1_l += tmp0_l;
3059        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3060        tmp0_h = q7_h_in - q2_h_in;
3061        tmp0_h += q3_h_in;
3062        tmp0_h -= p4_h_in;
3063        tmp1_h += tmp0_h;
3064        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3065        out_l = __lsx_vpickev_b(out_h, out_l);
3066        q3 = __lsx_vbitsel_v(q3, out_l, flat2);
3067        __lsx_vst(q3, dst, 16*10);
3068
3069        /* q4 */
3070        tmp0_l = q7_l_in - q3_l_in;
3071        tmp0_l += q4_l_in;
3072        tmp0_l -= p3_l_in;
3073        tmp1_l += tmp0_l;
3074        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3075        tmp0_h = q7_h_in - q3_h_in;
3076        tmp0_h += q4_h_in;
3077        tmp0_h -= p3_h_in;
3078        tmp1_h += tmp0_h;
3079        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3080        out_l = __lsx_vpickev_b(out_h, out_l);
3081        q4 = __lsx_vbitsel_v(q4, out_l, flat2);
3082        __lsx_vst(q4, dst, 16*11);
3083
3084        /* q5 */
3085        tmp0_l = q7_l_in - q4_l_in;
3086        tmp0_l += q5_l_in;
3087        tmp0_l -= p2_l_in;
3088        tmp1_l += tmp0_l;
3089        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3090        tmp0_h = q7_h_in - q4_h_in;
3091        tmp0_h += q5_h_in;
3092        tmp0_h -= p2_h_in;
3093        tmp1_h += tmp0_h;
3094        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3095        out_l = __lsx_vpickev_b(out_h, out_l);
3096        q5 = __lsx_vbitsel_v(q5, out_l, flat2);
3097        __lsx_vst(q5, dst, 16*12);
3098
3099        /* q6 */
3100        tmp0_l = q7_l_in - q5_l_in;
3101        tmp0_l += q6_l_in;
3102        tmp0_l -= p1_l_in;
3103        tmp1_l += tmp0_l;
3104        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3105        tmp0_h = q7_h_in - q5_h_in;
3106        tmp0_h += q6_h_in;
3107        tmp0_h -= p1_h_in;
3108        tmp1_h += tmp0_h;
3109        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3110        out_l = __lsx_vpickev_b(out_h, out_l);
3111        q6 = __lsx_vbitsel_v(q6, out_l, flat2);
3112        __lsx_vst(q6, dst, 16*13);
3113
3114        return 0;
3115    }
3116}
3117
3118void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
3119                                int32_t b_limit_ptr,
3120                                int32_t limit_ptr,
3121                                int32_t thresh_ptr)
3122{
3123    uint8_t early_exit = 0;
3124    uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
3125    uint8_t *filter48 = &transposed_input[16 * 16];
3126
3127    vp9_transpose_16x16((dst - 8), stride, &transposed_input[0], 16);
3128
3129    early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
3130                                          &filter48[0], dst, stride,
3131                                          b_limit_ptr, limit_ptr, thresh_ptr);
3132
3133    if (0 == early_exit) {
3134        early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), dst,
3135                                         stride, &filter48[0]);
3136
3137        if (0 == early_exit) {
3138            vp9_transpose_16x16(transposed_input, 16, (dst - 8), stride);
3139        }
3140    }
3141}
3142