1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * Contributed by Jin Bo <jinbo@loongson.cn>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavcodec/vp9dsp.h"
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "libavutil/common.h"
25 #include "vp9dsp_loongarch.h"
26 
27 #define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, _in2, \
28                  _in3, _in4, _in5, _in6, _in7)                                  \
29 {                                                                               \
30     _in0 = __lsx_vld(_src, 0);                                                  \
31     _in1 = __lsx_vldx(_src, _stride);                                           \
32     _in2 = __lsx_vldx(_src, _stride2);                                          \
33     _in3 = __lsx_vldx(_src, _stride3);                                          \
34     _src += _stride4;                                                           \
35     _in4 = __lsx_vld(_src, 0);                                                  \
36     _in5 = __lsx_vldx(_src, _stride);                                           \
37     _in6 = __lsx_vldx(_src, _stride2);                                          \
38     _in7 = __lsx_vldx(_src, _stride3);                                          \
39 }
40 
41 #define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7,        \
42                  _dst, _stride, _stride2, _stride3, _stride4)                   \
43 {                                                                               \
44     __lsx_vst(_dst0, _dst, 0);                                                  \
45     __lsx_vstx(_dst1, _dst, _stride);                                           \
46     __lsx_vstx(_dst2, _dst, _stride2);                                          \
47     __lsx_vstx(_dst3, _dst, _stride3);                                          \
48     _dst += _stride4;                                                           \
49     __lsx_vst(_dst4, _dst, 0);                                                  \
50     __lsx_vstx(_dst5, _dst, _stride);                                           \
51     __lsx_vstx(_dst6, _dst, _stride2);                                          \
52     __lsx_vstx(_dst7, _dst, _stride3);                                          \
53 }
54 
55 #define VP9_LPF_FILTER4_4W(p1_src, p0_src, q0_src, q1_src, mask_src, hev_src, \
56                            p1_dst, p0_dst, q0_dst, q1_dst)                    \
57 {                                                                             \
58     __m128i p1_tmp, p0_tmp, q0_tmp, q1_tmp, q0_sub_p0, filt, filt1, filt2;    \
59     const __m128i cnst3b = __lsx_vldi(3);                                     \
60     const __m128i cnst4b = __lsx_vldi(4);                                     \
61                                                                               \
62     p1_tmp = __lsx_vxori_b(p1_src, 0x80);                                     \
63     p0_tmp = __lsx_vxori_b(p0_src, 0x80);                                     \
64     q0_tmp = __lsx_vxori_b(q0_src, 0x80);                                     \
65     q1_tmp = __lsx_vxori_b(q1_src, 0x80);                                     \
66                                                                               \
67     filt = __lsx_vssub_b(p1_tmp, q1_tmp);                                     \
68                                                                               \
69     filt = filt & hev_src;                                                    \
70                                                                               \
71     q0_sub_p0 = __lsx_vssub_b(q0_tmp, p0_tmp);                                \
72     filt = __lsx_vsadd_b(filt, q0_sub_p0);                                    \
73     filt = __lsx_vsadd_b(filt, q0_sub_p0);                                    \
74     filt = __lsx_vsadd_b(filt, q0_sub_p0);                                    \
75     filt = filt & mask_src;                                                   \
76                                                                               \
77     filt1 = __lsx_vsadd_b(filt, cnst4b);                                      \
78     filt1 = __lsx_vsrai_b(filt1, 3);                                          \
79                                                                               \
80     filt2 = __lsx_vsadd_b(filt, cnst3b);                                      \
81     filt2 = __lsx_vsrai_b(filt2, 3);                                          \
82                                                                               \
83     q0_tmp = __lsx_vssub_b(q0_tmp, filt1);                                    \
84     q0_dst = __lsx_vxori_b(q0_tmp, 0x80);                                     \
85     p0_tmp = __lsx_vsadd_b(p0_tmp, filt2);                                    \
86     p0_dst = __lsx_vxori_b(p0_tmp, 0x80);                                     \
87                                                                               \
88     filt = __lsx_vsrari_b(filt1, 1);                                          \
89     hev_src = __lsx_vxori_b(hev_src, 0xff);                                   \
90     filt = filt & hev_src;                                                    \
91                                                                               \
92     q1_tmp = __lsx_vssub_b(q1_tmp, filt);                                     \
93     q1_dst = __lsx_vxori_b(q1_tmp, 0x80);                                     \
94     p1_tmp = __lsx_vsadd_b(p1_tmp, filt);                                     \
95     p1_dst = __lsx_vxori_b(p1_tmp, 0x80);                                     \
96 }
97 
98 #define VP9_FLAT4(p3_src, p2_src, p0_src, q0_src, q2_src, q3_src, flat_dst)  \
99 {                                                                            \
100     __m128i f_tmp = __lsx_vldi(1);                                           \
101     __m128i p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;              \
102                                                                              \
103     p2_a_sub_p0 = __lsx_vabsd_bu(p2_src, p0_src);                            \
104     q2_a_sub_q0 = __lsx_vabsd_bu(q2_src, q0_src);                            \
105     p3_a_sub_p0 = __lsx_vabsd_bu(p3_src, p0_src);                            \
106     q3_a_sub_q0 = __lsx_vabsd_bu(q3_src, q0_src);                            \
107                                                                              \
108     p2_a_sub_p0 = __lsx_vmax_bu(p2_a_sub_p0, q2_a_sub_q0);                   \
109     flat_dst = __lsx_vmax_bu(p2_a_sub_p0, flat_dst);                         \
110     p3_a_sub_p0 = __lsx_vmax_bu(p3_a_sub_p0, q3_a_sub_q0);                   \
111     flat_dst = __lsx_vmax_bu(p3_a_sub_p0, flat_dst);                         \
112                                                                              \
113     flat_dst = __lsx_vslt_bu(f_tmp, flat_dst);                               \
114     flat_dst = __lsx_vxori_b(flat_dst, 0xff);                                \
115     flat_dst = flat_dst & mask;                                              \
116 }
117 
118 #define VP9_FLAT5(p7_src, p6_src, p5_src, p4_src, p0_src, q0_src, q4_src, \
119                   q5_src, q6_src, q7_src, flat_src, flat2_dst)            \
120 {                                                                         \
121     __m128i f_tmp = __lsx_vldi(1);                                        \
122     __m128i p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;           \
123     __m128i p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;           \
124                                                                           \
125     p4_a_sub_p0 = __lsx_vabsd_bu(p4_src, p0_src);                         \
126     q4_a_sub_q0 = __lsx_vabsd_bu(q4_src, q0_src);                         \
127     p5_a_sub_p0 = __lsx_vabsd_bu(p5_src, p0_src);                         \
128     q5_a_sub_q0 = __lsx_vabsd_bu(q5_src, q0_src);                         \
129     p6_a_sub_p0 = __lsx_vabsd_bu(p6_src, p0_src);                         \
130     q6_a_sub_q0 = __lsx_vabsd_bu(q6_src, q0_src);                         \
131     p7_a_sub_p0 = __lsx_vabsd_bu(p7_src, p0_src);                         \
132     q7_a_sub_q0 = __lsx_vabsd_bu(q7_src, q0_src);                         \
133                                                                           \
134     p4_a_sub_p0 = __lsx_vmax_bu(p4_a_sub_p0, q4_a_sub_q0);                \
135     flat2_dst = __lsx_vmax_bu(p5_a_sub_p0, q5_a_sub_q0);                  \
136     flat2_dst = __lsx_vmax_bu(p4_a_sub_p0, flat2_dst);                    \
137     p6_a_sub_p0 = __lsx_vmax_bu(p6_a_sub_p0, q6_a_sub_q0);                \
138     flat2_dst = __lsx_vmax_bu(p6_a_sub_p0, flat2_dst);                    \
139     p7_a_sub_p0 = __lsx_vmax_bu(p7_a_sub_p0, q7_a_sub_q0);                \
140     flat2_dst = __lsx_vmax_bu(p7_a_sub_p0, flat2_dst);                    \
141                                                                           \
142     flat2_dst = __lsx_vslt_bu(f_tmp, flat2_dst);                          \
143     flat2_dst = __lsx_vxori_b(flat2_dst, 0xff);                           \
144     flat2_dst = flat2_dst & flat_src;                                     \
145 }
146 
147 #define VP9_FILTER8(p3_src, p2_src, p1_src, p0_src,            \
148                     q0_src, q1_src, q2_src, q3_src,            \
149                     p2_filt8_dst, p1_filt8_dst, p0_filt8_dst,  \
150                     q0_filt8_dst, q1_filt8_dst, q2_filt8_dst)  \
151 {                                                              \
152     __m128i tmp0, tmp1, tmp2;                                  \
153                                                                \
154     tmp2 = __lsx_vadd_h(p2_src, p1_src);                       \
155     tmp2 = __lsx_vadd_h(tmp2, p0_src);                         \
156     tmp0 = __lsx_vslli_h(p3_src, 1);                           \
157                                                                \
158     tmp0 = __lsx_vadd_h(tmp0, tmp2);                           \
159     tmp0 = __lsx_vadd_h(tmp0, q0_src);                         \
160     tmp1 = __lsx_vadd_h(tmp0, p3_src);                         \
161     tmp1 = __lsx_vadd_h(tmp1, p2_src);                         \
162     p2_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
163                                                                \
164     tmp1 = __lsx_vadd_h(tmp0, p1_src);                         \
165     tmp1 = __lsx_vadd_h(tmp1, q1_src);                         \
166     p1_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
167                                                                \
168     tmp1 = __lsx_vadd_h(q2_src, q1_src);                       \
169     tmp1 = __lsx_vadd_h(tmp1, q0_src);                         \
170     tmp2 = __lsx_vadd_h(tmp2, tmp1);                           \
171     tmp0 = __lsx_vadd_h(tmp2, p0_src);                         \
172     tmp0 = __lsx_vadd_h(tmp0, p3_src);                         \
173     p0_filt8_dst = __lsx_vsrari_h(tmp0, 3);                    \
174                                                                \
175     tmp0 = __lsx_vadd_h(q2_src, q3_src);                       \
176     tmp0 = __lsx_vadd_h(tmp0, p0_src);                         \
177     tmp0 = __lsx_vadd_h(tmp0, tmp1);                           \
178     tmp1 = __lsx_vadd_h(q3_src, q3_src);                       \
179     tmp1 = __lsx_vadd_h(tmp1, tmp0);                           \
180     q2_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
181                                                                \
182     tmp0 = __lsx_vadd_h(tmp2, q3_src);                         \
183     tmp1 = __lsx_vadd_h(tmp0, q0_src);                         \
184     q0_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
185                                                                \
186     tmp1 = __lsx_vsub_h(tmp0, p2_src);                         \
187     tmp0 = __lsx_vadd_h(q1_src, q3_src);                       \
188     tmp1 = __lsx_vadd_h(tmp0, tmp1);                           \
189     q1_filt8_dst = __lsx_vsrari_h(tmp1, 3);                    \
190 }
191 
192 #define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,        \
193                      q2_src, q3_src, limit_src, b_limit_src, thresh_src,    \
194                      hev_dst, mask_dst, flat_dst)                           \
195 {                                                                           \
196     __m128i p3_asub_p2_tmp, p2_asub_p1_tmp, p1_asub_p0_tmp, q1_asub_q0_tmp; \
197     __m128i p1_asub_q1_tmp, p0_asub_q0_tmp, q3_asub_q2_tmp, q2_asub_q1_tmp; \
198                                                                             \
199     /* absolute subtraction of pixel values */                              \
200     p3_asub_p2_tmp = __lsx_vabsd_bu(p3_src, p2_src);                        \
201     p2_asub_p1_tmp = __lsx_vabsd_bu(p2_src, p1_src);                        \
202     p1_asub_p0_tmp = __lsx_vabsd_bu(p1_src, p0_src);                        \
203     q1_asub_q0_tmp = __lsx_vabsd_bu(q1_src, q0_src);                        \
204     q2_asub_q1_tmp = __lsx_vabsd_bu(q2_src, q1_src);                        \
205     q3_asub_q2_tmp = __lsx_vabsd_bu(q3_src, q2_src);                        \
206     p0_asub_q0_tmp = __lsx_vabsd_bu(p0_src, q0_src);                        \
207     p1_asub_q1_tmp = __lsx_vabsd_bu(p1_src, q1_src);                        \
208                                                                             \
209     /* calculation of hev */                                                \
210     flat_dst = __lsx_vmax_bu(p1_asub_p0_tmp, q1_asub_q0_tmp);               \
211     hev_dst = __lsx_vslt_bu(thresh_src, flat_dst);                          \
212                                                                             \
213     /* calculation of mask */                                               \
214     p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p0_asub_q0_tmp);        \
215     p1_asub_q1_tmp = __lsx_vsrli_b(p1_asub_q1_tmp, 1);                      \
216     p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p1_asub_q1_tmp);        \
217                                                                             \
218     mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_tmp);                  \
219     mask_dst = __lsx_vmax_bu(flat_dst, mask_dst);                           \
220     p3_asub_p2_tmp = __lsx_vmax_bu(p3_asub_p2_tmp, p2_asub_p1_tmp);         \
221     mask_dst = __lsx_vmax_bu(p3_asub_p2_tmp, mask_dst);                     \
222     q2_asub_q1_tmp = __lsx_vmax_bu(q2_asub_q1_tmp, q3_asub_q2_tmp);         \
223     mask_dst = __lsx_vmax_bu(q2_asub_q1_tmp, mask_dst);                     \
224                                                                             \
225     mask_dst = __lsx_vslt_bu(limit_src, mask_dst);                          \
226     mask_dst = __lsx_vxori_b(mask_dst, 0xff);                               \
227 }
228 
ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)229 void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
230                               int32_t b_limit_ptr,
231                               int32_t limit_ptr,
232                               int32_t thresh_ptr)
233 {
234     ptrdiff_t stride2 = stride << 1;
235     ptrdiff_t stride3 = stride2 + stride;
236     ptrdiff_t stride4 = stride2 << 1;
237     __m128i mask, hev, flat, thresh, b_limit, limit;
238     __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
239 
240     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
241               dst, -stride, p3, p2, p1, p0);
242     q0 = __lsx_vld(dst, 0);
243     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
244     q3 = __lsx_vldx(dst, stride3);
245 
246     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
247     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
248     limit   = __lsx_vreplgr2vr_b(limit_ptr);
249 
250     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
251                  hev, mask, flat);
252 
253     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
254                        q1_out);
255 
256     __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
257     __lsx_vstelm_d(p0_out, dst -  stride, 0, 0);
258     __lsx_vstelm_d(q0_out, dst          , 0, 0);
259     __lsx_vstelm_d(q1_out, dst +  stride, 0, 0);
260 }
261 
ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)262 void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
263                                 int32_t b_limit_ptr,
264                                 int32_t limit_ptr,
265                                 int32_t thresh_ptr)
266 {
267     ptrdiff_t stride2 = stride << 1;
268     ptrdiff_t stride3 = stride2 + stride;
269     ptrdiff_t stride4 = stride2 << 1;
270     __m128i mask, hev, flat, thresh0, b_limit0;
271     __m128i limit0, thresh1, b_limit1, limit1;
272     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
273 
274     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
275               dst, -stride, p3, p2, p1, p0);
276     q0 = __lsx_vld(dst, 0);
277     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
278     q3 = __lsx_vldx(dst, stride3);
279 
280     thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
281     thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
282     thresh0 = __lsx_vilvl_d(thresh1, thresh0);
283 
284     b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
285     b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
286     b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
287 
288     limit0 = __lsx_vreplgr2vr_b(limit_ptr);
289     limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
290     limit0 = __lsx_vilvl_d(limit1, limit0);
291 
292     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
293                  hev, mask, flat);
294     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
295 
296     __lsx_vst(p1, dst - stride2, 0);
297     __lsx_vst(p0, dst -  stride, 0);
298     __lsx_vst(q0, dst          , 0);
299     __lsx_vst(q1, dst +  stride, 0);
300 }
301 
ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)302 void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
303                               int32_t b_limit_ptr,
304                               int32_t limit_ptr,
305                               int32_t thresh_ptr)
306 {
307     ptrdiff_t stride2 = stride << 1;
308     ptrdiff_t stride3 = stride2 + stride;
309     ptrdiff_t stride4 = stride2 << 1;
310     __m128i mask, hev, flat, thresh, b_limit, limit;
311     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
312     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
313     __m128i p2_filter8, p1_filter8, p0_filter8;
314     __m128i q0_filter8, q1_filter8, q2_filter8;
315     __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
316     __m128i zero = __lsx_vldi(0);
317 
318     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
319               dst, -stride, p3, p2, p1, p0);
320     q0 = __lsx_vld(dst, 0);
321     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
322     q3 = __lsx_vldx(dst, stride3);
323 
324     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
325     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
326     limit   = __lsx_vreplgr2vr_b(limit_ptr);
327 
328     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
329                  hev, mask, flat);
330     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
331     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
332                        q1_out);
333 
334     flat = __lsx_vilvl_d(zero, flat);
335 
336     /* if flat is zero for all pixels, then no need to calculate other filter */
337     if (__lsx_bz_v(flat)) {
338         __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
339         __lsx_vstelm_d(p0_out, dst -  stride, 0, 0);
340         __lsx_vstelm_d(q0_out, dst          , 0, 0);
341         __lsx_vstelm_d(q1_out, dst +  stride, 0, 0);
342     } else {
343         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
344                   p3_l, p2_l, p1_l, p0_l);
345         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
346                   q0_l, q1_l, q2_l, q3_l);
347         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
348                     p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
349 
350         /* convert 16 bit output data into 8 bit */
351         DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
352                   zero, p0_filter8, zero, q0_filter8, p2_filter8,
353                   p1_filter8, p0_filter8, q0_filter8);
354         DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
355                   q1_filter8, q2_filter8);
356 
357         /* store pixel values */
358         p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
359         p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
360         p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
361         q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
362         q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
363         q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
364 
365         __lsx_vstelm_d(p2_out, dst - stride3, 0, 0);
366         __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
367         __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
368         __lsx_vstelm_d(q0_out, dst, 0, 0);
369         __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
370         __lsx_vstelm_d(q2_out, dst + stride2, 0, 0);
371     }
372 }
373 
ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)374 void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
375                                 int32_t b_limit_ptr,
376                                 int32_t limit_ptr,
377                                 int32_t thresh_ptr)
378 {
379     ptrdiff_t stride2 = stride << 1;
380     ptrdiff_t stride3 = stride2 + stride;
381     ptrdiff_t stride4 = stride2 << 1;
382     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
383     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
384     __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
385     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
386     __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
387     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
388     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
389     __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
390     __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
391     __m128i zero = __lsx_vldi(0);
392 
393     /* load vector elements */
394     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
395               dst, -stride, p3, p2, p1, p0);
396     q0 = __lsx_vld(dst, 0);
397     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
398     q3 = __lsx_vldx(dst, stride3);
399 
400     thresh = __lsx_vreplgr2vr_b(thresh_ptr);
401     tmp    = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
402     thresh = __lsx_vilvl_d(tmp, thresh);
403 
404     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
405     tmp     = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
406     b_limit = __lsx_vilvl_d(tmp, b_limit);
407 
408     limit = __lsx_vreplgr2vr_b(limit_ptr);
409     tmp   = __lsx_vreplgr2vr_b(limit_ptr >> 8);
410     limit = __lsx_vilvl_d(tmp, limit);
411 
412     /* mask and hev */
413     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
414                  hev, mask, flat);
415     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
416     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
417                        q1_out);
418 
419     /* if flat is zero for all pixels, then no need to calculate other filter */
420     if (__lsx_bz_v(flat)) {
421         __lsx_vst(p1_out, dst - stride2, 0);
422         __lsx_vst(p0_out, dst - stride, 0);
423         __lsx_vst(q0_out, dst, 0);
424         __lsx_vst(q1_out, dst + stride, 0);
425     } else {
426         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
427                   p3_l, p2_l, p1_l, p0_l);
428         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
429                   q0_l, q1_l, q2_l, q3_l);
430         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
431                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
432 
433         DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
434                   p3_h, p2_h, p1_h, p0_h);
435         DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
436                   q0_h, q1_h, q2_h, q3_h);
437         VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
438                     p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
439 
440         /* convert 16 bit output data into 8 bit */
441         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
442                   p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
443                   p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
444         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
445                   q2_filt8_l, q1_filt8_l, q2_filt8_l);
446 
447         /* store pixel values */
448         p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
449         p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
450         p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
451         q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
452         q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
453         q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
454 
455 
456         __lsx_vstx(p2_out, dst, -stride3);
457         __lsx_vstx(p1_out, dst, -stride2);
458         __lsx_vstx(p0_out, dst, -stride);
459         __lsx_vst(q0_out, dst, 0);
460         __lsx_vstx(q1_out, dst, stride);
461         __lsx_vstx(q2_out, dst, stride2);
462     }
463 }
464 
ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)465 void ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
466                                 int32_t b_limit_ptr,
467                                 int32_t limit_ptr,
468                                 int32_t thresh_ptr)
469 {
470     ptrdiff_t stride2 = stride << 1;
471     ptrdiff_t stride3 = stride2 + stride;
472     ptrdiff_t stride4 = stride2 << 1;
473     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
474     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
475     __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
476     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
477     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
478     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
479     __m128i zero = __lsx_vldi(0);
480 
481     /* load vector elements */
482     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
483               dst, -stride, p3, p2, p1, p0);
484     q0 = __lsx_vld(dst, 0);
485     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
486     q3 = __lsx_vldx(dst, stride3);
487 
488     thresh = __lsx_vreplgr2vr_b(thresh_ptr);
489     tmp    = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
490     thresh = __lsx_vilvl_d(tmp, thresh);
491 
492     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
493     tmp     = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
494     b_limit = __lsx_vilvl_d(tmp, b_limit);
495 
496     limit = __lsx_vreplgr2vr_b(limit_ptr);
497     tmp   = __lsx_vreplgr2vr_b(limit_ptr >> 8);
498     limit = __lsx_vilvl_d(tmp, limit);
499 
500     /* mask and hev */
501     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
502                  hev, mask, flat);
503     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
504     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
505                        q1_out);
506 
507     flat = __lsx_vilvl_d(zero, flat);
508 
509     /* if flat is zero for all pixels, then no need to calculate other filter */
510     if (__lsx_bz_v(flat)) {
511         __lsx_vstx(p1_out, dst, -stride2);
512         __lsx_vstx(p0_out, dst, -stride);
513         __lsx_vst(q0_out, dst, 0);
514         __lsx_vstx(q1_out, dst, stride);
515     } else {
516         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
517                   p3_l, p2_l, p1_l, p0_l);
518         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
519                   q0_l, q1_l, q2_l, q3_l);
520         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
521                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
522 
523         /* convert 16 bit output data into 8 bit */
524         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
525                   p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
526                   p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
527         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
528                   q2_filt8_l, q1_filt8_l, q2_filt8_l);
529 
530         /* store pixel values */
531         p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
532         p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
533         p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
534         q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
535         q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
536         q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
537 
538         __lsx_vstx(p2_out, dst, -stride3);
539         __lsx_vstx(p1_out, dst, -stride2);
540         __lsx_vstx(p0_out, dst, -stride);
541         __lsx_vst(q0_out, dst, 0);
542         __lsx_vstx(q1_out, dst, stride);
543         __lsx_vstx(q2_out, dst, stride2);
544     }
545 }
546 
ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)547 void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
548                                 int32_t b_limit_ptr,
549                                 int32_t limit_ptr,
550                                 int32_t thresh_ptr)
551 {
552     ptrdiff_t stride2 = stride << 1;
553     ptrdiff_t stride3 = stride2 + stride;
554     ptrdiff_t stride4 = stride2 << 1;
555     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
556     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
557     __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
558     __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
559     __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
560     __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
561     __m128i zero = { 0 };
562 
563     /* load vector elements */
564     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
565               dst, -stride, p3, p2, p1, p0);
566     q0 = __lsx_vld(dst, 0);
567     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
568     q3 = __lsx_vldx(dst, stride3);
569 
570     thresh = __lsx_vreplgr2vr_b(thresh_ptr);
571     tmp    = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
572     thresh = __lsx_vilvl_d(tmp, thresh);
573 
574     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
575     tmp     = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
576     b_limit = __lsx_vilvl_d(tmp, b_limit);
577 
578     limit = __lsx_vreplgr2vr_b(limit_ptr);
579     tmp   = __lsx_vreplgr2vr_b(limit_ptr >> 8);
580     limit = __lsx_vilvl_d(tmp, limit);
581 
582     /* mask and hev */
583     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
584                  hev, mask, flat);
585     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
586     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
587                        q1_out);
588 
589     flat = __lsx_vilvh_d(flat, zero);
590 
591     /* if flat is zero for all pixels, then no need to calculate other filter */
592     if (__lsx_bz_v(flat)) {
593         __lsx_vstx(p1_out, dst, -stride2);
594         __lsx_vstx(p0_out, dst, -stride);
595         __lsx_vst(q0_out, dst, 0);
596         __lsx_vstx(q1_out, dst, stride);
597     } else {
598         DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
599                   p3_h, p2_h, p1_h, p0_h);
600         DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
601                   q0_h, q1_h, q2_h, q3_h);
602         VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
603                     p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
604 
605         /* convert 16 bit output data into 8 bit */
606         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
607                   p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
608                   p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
609         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
610                   q2_filt8_h, q1_filt8_h, q2_filt8_h);
611 
612         /* store pixel values */
613         p2_out = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
614         p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
615         p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
616         q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
617         q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
618         q2_out = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
619 
620         __lsx_vstx(p2_out, dst, -stride3);
621         __lsx_vstx(p1_out, dst, -stride2);
622         __lsx_vstx(p0_out, dst, -stride);
623         __lsx_vst(q0_out, dst, 0);
624         __lsx_vstx(q1_out, dst, stride);
625         __lsx_vstx(q2_out, dst, stride2);
626     }
627 }
628 
vp9_hz_lpf_t4_and_t8_16w(uint8_t *dst, ptrdiff_t stride, uint8_t *filter48, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)629 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *dst, ptrdiff_t stride,
630                                         uint8_t *filter48,
631                                         int32_t b_limit_ptr,
632                                         int32_t limit_ptr,
633                                         int32_t thresh_ptr)
634 {
635     ptrdiff_t stride2 = stride << 1;
636     ptrdiff_t stride3 = stride2 + stride;
637     ptrdiff_t stride4 = stride2 << 1;
638     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
639     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
640     __m128i flat, mask, hev, thresh, b_limit, limit;
641     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
642     __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
643     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
644     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
645     __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
646     __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
647     __m128i zero = __lsx_vldi(0);
648 
649     /* load vector elements */
650     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
651               dst, -stride, p3, p2, p1, p0);
652     q0 = __lsx_vld(dst, 0);
653     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
654     q3 = __lsx_vldx(dst, stride3);
655 
656     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
657     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
658     limit   = __lsx_vreplgr2vr_b(limit_ptr);
659 
660     /* mask and hev */
661     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
662                  hev, mask, flat);
663     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
664     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
665                        q1_out);
666 
667     /* if flat is zero for all pixels, then no need to calculate other filter */
668     if (__lsx_bz_v(flat)) {
669         __lsx_vstx(p1_out, dst, -stride2);
670         __lsx_vstx(p0_out, dst, -stride);
671         __lsx_vst(q0_out, dst, 0);
672         __lsx_vstx(q1_out, dst, stride);
673         return 1;
674     } else {
675         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
676                   p3_l, p2_l, p1_l, p0_l);
677         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
678                   q0_l, q1_l, q2_l, q3_l);
679         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
680                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
681 
682         DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
683                   p3_h, p2_h, p1_h, p0_h);
684         DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
685                   q0_h, q1_h, q2_h, q3_h);
686         VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
687                     p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
688 
689         /* convert 16 bit output data into 8 bit */
690         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
691                   p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
692                   p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
693         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
694                   q2_filt8_l, q1_filt8_l, q2_filt8_l);
695 
696         /* store pixel values */
697         p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
698         p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
699         p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
700         q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
701         q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
702         q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
703 
704         __lsx_vst(p2_out, filter48, 0);
705         __lsx_vst(p1_out, filter48, 16);
706         __lsx_vst(p0_out, filter48, 32);
707         __lsx_vst(q0_out, filter48, 48);
708         __lsx_vst(q1_out, filter48, 64);
709         __lsx_vst(q2_out, filter48, 80);
710         __lsx_vst(flat, filter48, 96);
711 
712         return 0;
713     }
714 }
715 
vp9_hz_lpf_t16_16w(uint8_t *dst, ptrdiff_t stride, uint8_t *filter48)716 static void vp9_hz_lpf_t16_16w(uint8_t *dst, ptrdiff_t stride,
717                                uint8_t *filter48)
718 {
719     ptrdiff_t stride2 = stride << 1;
720     ptrdiff_t stride3 = stride2 + stride;
721     ptrdiff_t stride4 = stride2 << 1;
722     uint8_t *dst_tmp = dst - stride4;
723     uint8_t *dst_tmp1 = dst + stride4;
724     __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
725     __m128i flat, flat2, filter8;
726     __m128i zero = __lsx_vldi(0);
727     __m128i out_h, out_l;
728     v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
729     v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
730     v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
731     v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
732     v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
733     v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
734     v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
735     v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
736     v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
737 
738     flat = __lsx_vld(filter48, 96);
739 
740     DUP4_ARG2(__lsx_vldx, dst_tmp, -stride4, dst_tmp, -stride3, dst_tmp,
741               -stride2, dst_tmp, -stride, p7, p6, p5, p4);
742     p3 = __lsx_vld(dst_tmp, 0);
743     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
744     p0 = __lsx_vldx(dst_tmp, stride3);
745 
746     q0 = __lsx_vld(dst, 0);
747     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
748     q3 = __lsx_vldx(dst, stride3);
749 
750     q4 = __lsx_vld(dst_tmp1, 0);
751     DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
752     q7 = __lsx_vldx(dst_tmp1, stride3);
753     VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
754 
755     /* if flat2 is zero for all pixels, then no need to calculate other filter */
756     if (__lsx_bz_v(flat2)) {
757         DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48,
758                   48, p2, p1, p0, q0);
759         DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
760 
761         __lsx_vstx(p2, dst, -stride3);
762         __lsx_vstx(p1, dst, -stride2);
763         __lsx_vstx(p0, dst, -stride);
764         __lsx_vst(q0, dst, 0);
765         __lsx_vstx(q1, dst, stride);
766         __lsx_vstx(q2, dst, stride2);
767     } else {
768         dst = dst_tmp - stride3;
769 
770         p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
771         p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
772         p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
773         p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
774         p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
775         p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
776         p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
777         p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
778 
779         q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
780 
781         tmp0_l = p7_l_in << 3;
782         tmp0_l -= p7_l_in;
783         tmp0_l += p6_l_in;
784         tmp0_l += q0_l_in;
785         tmp1_l = p6_l_in + p5_l_in;
786         tmp1_l += p4_l_in;
787         tmp1_l += p3_l_in;
788         tmp1_l += p2_l_in;
789         tmp1_l += p1_l_in;
790         tmp1_l += p0_l_in;
791         tmp1_l += tmp0_l;
792 
793         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
794 
795         p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
796         p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
797         p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
798         p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
799 
800         p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
801         p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
802         p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
803         p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
804         q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
805 
806         tmp0_h = p7_h_in << 3;
807         tmp0_h -= p7_h_in;
808         tmp0_h += p6_h_in;
809         tmp0_h += q0_h_in;
810         tmp1_h = p6_h_in + p5_h_in;
811         tmp1_h += p4_h_in;
812         tmp1_h += p3_h_in;
813         tmp1_h += p2_h_in;
814         tmp1_h += p1_h_in;
815         tmp1_h += p0_h_in;
816         tmp1_h += tmp0_h;
817 
818         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
819 
820         out_l = __lsx_vpickev_b(out_h, out_l);
821         p6 = __lsx_vbitsel_v(p6, out_l, flat2);
822         __lsx_vst(p6, dst, 0);
823         dst += stride;
824 
825         /* p5 */
826         q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
827         tmp0_l = p5_l_in - p6_l_in;
828         tmp0_l += q1_l_in;
829         tmp0_l -= p7_l_in;
830         tmp1_l += tmp0_l;
831         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
832 
833         q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
834         tmp0_h = p5_h_in - p6_h_in;
835         tmp0_h += q1_h_in;
836         tmp0_h -= p7_h_in;
837         tmp1_h += tmp0_h;
838         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
839 
840         out_l = __lsx_vpickev_b(out_h, out_l);
841         p5 = __lsx_vbitsel_v(p5, out_l, flat2);
842         __lsx_vst(p5, dst, 0);
843         dst += stride;
844 
845         /* p4 */
846         q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
847         tmp0_l = p4_l_in - p5_l_in;
848         tmp0_l += q2_l_in;
849         tmp0_l -= p7_l_in;
850         tmp1_l += tmp0_l;
851         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
852 
853         q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
854         tmp0_h = p4_h_in - p5_h_in;
855         tmp0_h += q2_h_in;
856         tmp0_h -= p7_h_in;
857         tmp1_h += tmp0_h;
858         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
859 
860         out_l = __lsx_vpickev_b(out_h, out_l);
861         p4 = __lsx_vbitsel_v(p4, out_l, flat2);
862         __lsx_vst(p4, dst, 0);
863         dst += stride;
864 
865         /* p3 */
866         q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
867         tmp0_l = p3_l_in - p4_l_in;
868         tmp0_l += q3_l_in;
869         tmp0_l -= p7_l_in;
870         tmp1_l += tmp0_l;
871         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
872 
873         q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
874         tmp0_h = p3_h_in - p4_h_in;
875         tmp0_h += q3_h_in;
876         tmp0_h -= p7_h_in;
877         tmp1_h += tmp0_h;
878         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
879 
880         out_l = __lsx_vpickev_b(out_h, out_l);
881         p3 = __lsx_vbitsel_v(p3, out_l, flat2);
882         __lsx_vst(p3, dst, 0);
883         dst += stride;
884 
885         /* p2 */
886         q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
887         filter8 = __lsx_vld(filter48, 0);
888         tmp0_l = p2_l_in - p3_l_in;
889         tmp0_l += q4_l_in;
890         tmp0_l -= p7_l_in;
891         tmp1_l += tmp0_l;
892         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
893 
894         q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
895         tmp0_h = p2_h_in - p3_h_in;
896         tmp0_h += q4_h_in;
897         tmp0_h -= p7_h_in;
898         tmp1_h += tmp0_h;
899         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
900 
901         out_l = __lsx_vpickev_b(out_h, out_l);
902         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
903         __lsx_vst(filter8, dst, 0);
904         dst += stride;
905 
906         /* p1 */
907         q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
908         filter8 = __lsx_vld(filter48, 16);
909         tmp0_l = p1_l_in - p2_l_in;
910         tmp0_l += q5_l_in;
911         tmp0_l -= p7_l_in;
912         tmp1_l += tmp0_l;
913         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
914 
915         q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
916         tmp0_h = p1_h_in - p2_h_in;
917         tmp0_h += q5_h_in;
918         tmp0_h -= p7_h_in;
919         tmp1_h += tmp0_h;
920         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
921 
922         out_l = __lsx_vpickev_b(out_h, out_l);
923         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
924         __lsx_vst(filter8, dst, 0);
925         dst += stride;
926 
927         /* p0 */
928         q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
929         filter8 = __lsx_vld(filter48, 32);
930         tmp0_l = p0_l_in - p1_l_in;
931         tmp0_l += q6_l_in;
932         tmp0_l -= p7_l_in;
933         tmp1_l += tmp0_l;
934         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
935 
936         q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
937         tmp0_h = p0_h_in - p1_h_in;
938         tmp0_h += q6_h_in;
939         tmp0_h -= p7_h_in;
940         tmp1_h += tmp0_h;
941         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
942 
943         out_l = __lsx_vpickev_b(out_h, out_l);
944         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
945         __lsx_vst(filter8, dst, 0);
946         dst += stride;
947 
948         /* q0 */
949         q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
950         filter8 = __lsx_vld(filter48, 48);
951         tmp0_l = q7_l_in - p0_l_in;
952         tmp0_l += q0_l_in;
953         tmp0_l -= p7_l_in;
954         tmp1_l += tmp0_l;
955         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
956 
957         q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
958         tmp0_h = q7_h_in - p0_h_in;
959         tmp0_h += q0_h_in;
960         tmp0_h -= p7_h_in;
961         tmp1_h += tmp0_h;
962         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
963 
964         out_l = __lsx_vpickev_b(out_h, out_l);
965         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
966         __lsx_vst(filter8, dst, 0);
967         dst += stride;
968 
969         /* q1 */
970         filter8 = __lsx_vld(filter48, 64);
971         tmp0_l = q7_l_in - q0_l_in;
972         tmp0_l += q1_l_in;
973         tmp0_l -= p6_l_in;
974         tmp1_l += tmp0_l;
975         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
976 
977         tmp0_h = q7_h_in - q0_h_in;
978         tmp0_h += q1_h_in;
979         tmp0_h -= p6_h_in;
980         tmp1_h += tmp0_h;
981         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
982 
983         out_l = __lsx_vpickev_b(out_h, out_l);
984         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
985         __lsx_vst(filter8, dst, 0);
986         dst += stride;
987 
988         /* q2 */
989         filter8 = __lsx_vld(filter48, 80);
990         tmp0_l = q7_l_in - q1_l_in;
991         tmp0_l += q2_l_in;
992         tmp0_l -= p5_l_in;
993         tmp1_l += tmp0_l;
994         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
995 
996         tmp0_h = q7_h_in - q1_h_in;
997         tmp0_h += q2_h_in;
998         tmp0_h -= p5_h_in;
999         tmp1_h += tmp0_h;
1000         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1001 
1002         out_l = __lsx_vpickev_b(out_h, out_l);
1003         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
1004         __lsx_vst(filter8, dst, 0);
1005         dst += stride;
1006 
1007         /* q3 */
1008         tmp0_l = q7_l_in - q2_l_in;
1009         tmp0_l += q3_l_in;
1010         tmp0_l -= p4_l_in;
1011         tmp1_l += tmp0_l;
1012         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1013 
1014         tmp0_h = q7_h_in - q2_h_in;
1015         tmp0_h += q3_h_in;
1016         tmp0_h -= p4_h_in;
1017         tmp1_h += tmp0_h;
1018         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1019 
1020         out_l = __lsx_vpickev_b(out_h, out_l);
1021         q3 = __lsx_vbitsel_v(q3, out_l, flat2);
1022         __lsx_vst(q3, dst, 0);
1023         dst += stride;
1024 
1025         /* q4 */
1026         tmp0_l = q7_l_in - q3_l_in;
1027         tmp0_l += q4_l_in;
1028         tmp0_l -= p3_l_in;
1029         tmp1_l += tmp0_l;
1030         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1031 
1032         tmp0_h = q7_h_in - q3_h_in;
1033         tmp0_h += q4_h_in;
1034         tmp0_h -= p3_h_in;
1035         tmp1_h += tmp0_h;
1036         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1037 
1038         out_l = __lsx_vpickev_b(out_h, out_l);
1039         q4 = __lsx_vbitsel_v(q4, out_l, flat2);
1040         __lsx_vst(q4, dst, 0);
1041         dst += stride;
1042 
1043         /* q5 */
1044         tmp0_l = q7_l_in - q4_l_in;
1045         tmp0_l += q5_l_in;
1046         tmp0_l -= p2_l_in;
1047         tmp1_l += tmp0_l;
1048         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1049 
1050         tmp0_h = q7_h_in - q4_h_in;
1051         tmp0_h += q5_h_in;
1052         tmp0_h -= p2_h_in;
1053         tmp1_h += tmp0_h;
1054         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1055 
1056         out_l = __lsx_vpickev_b(out_h, out_l);
1057         q5 = __lsx_vbitsel_v(q5, out_l, flat2);
1058         __lsx_vst(q5, dst, 0);
1059         dst += stride;
1060 
1061         /* q6 */
1062         tmp0_l = q7_l_in - q5_l_in;
1063         tmp0_l += q6_l_in;
1064         tmp0_l -= p1_l_in;
1065         tmp1_l += tmp0_l;
1066         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1067 
1068         tmp0_h = q7_h_in - q5_h_in;
1069         tmp0_h += q6_h_in;
1070         tmp0_h -= p1_h_in;
1071         tmp1_h += tmp0_h;
1072         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1073 
1074         out_l = __lsx_vpickev_b(out_h, out_l);
1075         q6 = __lsx_vbitsel_v(q6, out_l, flat2);
1076         __lsx_vst(q6, dst, 0);
1077     }
1078 }
1079 
ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)1080 void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
1081                                 int32_t b_limit_ptr,
1082                                 int32_t limit_ptr,
1083                                 int32_t thresh_ptr)
1084 {
1085     uint8_t filter48[16 * 8] __attribute__ ((aligned(16)));
1086     uint8_t early_exit = 0;
1087 
1088     early_exit = vp9_hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0],
1089                                           b_limit_ptr, limit_ptr, thresh_ptr);
1090 
1091     if (0 == early_exit) {
1092         vp9_hz_lpf_t16_16w(dst, stride, filter48);
1093     }
1094 }
1095 
ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)1096 void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
1097                                int32_t b_limit_ptr,
1098                                int32_t limit_ptr,
1099                                int32_t thresh_ptr)
1100 {
1101     ptrdiff_t stride2 = stride << 1;
1102     ptrdiff_t stride3 = stride2 + stride;
1103     ptrdiff_t stride4 = stride2 << 1;
1104     uint8_t *dst_tmp = dst - stride4;
1105     uint8_t *dst_tmp1 = dst + stride4;
1106     __m128i zero = __lsx_vldi(0);
1107     __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
1108     __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
1109     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1110     __m128i p0_filter16, p1_filter16;
1111     __m128i p2_filter8, p1_filter8, p0_filter8;
1112     __m128i q0_filter8, q1_filter8, q2_filter8;
1113     __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
1114     __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
1115     __m128i tmp0, tmp1, tmp2;
1116 
1117     /* load vector elements */
1118     DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
1119               dst, -stride, p3, p2, p1, p0);
1120     q0 = __lsx_vld(dst, 0);
1121     DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
1122     q3 = __lsx_vldx(dst, stride3);
1123 
1124     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
1125     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1126     limit   = __lsx_vreplgr2vr_b(limit_ptr);
1127 
1128     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1129                  hev, mask, flat);
1130     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1131     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1132                        q1_out);
1133 
1134     flat = __lsx_vilvl_d(zero, flat);
1135 
1136     /* if flat is zero for all pixels, then no need to calculate other filter */
1137     if (__lsx_bz_v(flat)) {
1138         __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
1139         __lsx_vstelm_d(p0_out, dst -   stride, 0, 0);
1140         __lsx_vstelm_d(q0_out, dst           , 0, 0);
1141         __lsx_vstelm_d(q1_out, dst +   stride, 0, 0);
1142     } else {
1143         /* convert 8 bit input data into 16 bit */
1144         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1145                   p3_l, p2_l, p1_l, p0_l);
1146         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1147                   q0_l, q1_l, q2_l, q3_l);
1148         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
1149                     p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1150                     q1_filter8, q2_filter8);
1151 
1152         /* convert 16 bit output data into 8 bit */
1153         DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
1154                   zero, p0_filter8, zero, q0_filter8, p2_filter8,
1155                   p1_filter8, p0_filter8, q0_filter8);
1156         DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
1157                   q1_filter8, q2_filter8);
1158 
1159         /* store pixel values */
1160         p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
1161         p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
1162         p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
1163         q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
1164         q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
1165         q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
1166 
1167         /* load 16 vector elements */
1168         DUP4_ARG2(__lsx_vld, dst_tmp - stride4, 0, dst_tmp - stride3, 0,
1169                   dst_tmp - stride2, 0, dst_tmp - stride, 0, p7, p6, p5, p4);
1170         DUP4_ARG2(__lsx_vld, dst_tmp1, 0, dst_tmp1 + stride, 0,
1171                   dst_tmp1 + stride2, 0, dst_tmp1 + stride3, 0, q4, q5, q6, q7);
1172 
1173         VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1174 
1175         /* if flat2 is zero for all pixels, there is no need to apply the other filters */
1176         if (__lsx_bz_v(flat2)) {
1177             dst -= stride3;
1178             __lsx_vstelm_d(p2_out, dst, 0, 0);
1179             dst += stride;
1180             __lsx_vstelm_d(p1_out, dst, 0, 0);
1181             dst += stride;
1182             __lsx_vstelm_d(p0_out, dst, 0, 0);
1183             dst += stride;
1184             __lsx_vstelm_d(q0_out, dst, 0, 0);
1185             dst += stride;
1186             __lsx_vstelm_d(q1_out, dst, 0, 0);
1187             dst += stride;
1188             __lsx_vstelm_d(q2_out, dst, 0, 0);
1189         } else {
1190             /* operate on the low (right) 8 pixels */
1191             DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4,
1192                       p7_l, p6_l, p5_l, p4_l);
1193             DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7,
1194                       q4_l, q5_l, q6_l, q7_l);
1195 
1196             tmp0 = __lsx_vslli_h(p7_l, 3);
1197             tmp0 = __lsx_vsub_h(tmp0, p7_l);
1198             tmp0 = __lsx_vadd_h(tmp0, p6_l);
1199             tmp0 = __lsx_vadd_h(tmp0, q0_l);
1200 
1201             dst = dst_tmp - stride3;
1202 
1203             /* calculation of p6 and p5 */
1204             tmp1 = __lsx_vadd_h(p6_l, p5_l);
1205             tmp1 = __lsx_vadd_h(tmp1, p4_l);
1206             tmp1 = __lsx_vadd_h(tmp1, p3_l);
1207             tmp1 = __lsx_vadd_h(tmp1, p2_l);
1208             tmp1 = __lsx_vadd_h(tmp1, p1_l);
1209             tmp1 = __lsx_vadd_h(tmp1, p0_l);
1210             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1211 
1212             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1213             tmp0 = __lsx_vsub_h(p5_l, p6_l);
1214             tmp0 = __lsx_vadd_h(tmp0, q1_l);
1215             tmp0 = __lsx_vsub_h(tmp0, p7_l);
1216             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1217 
1218             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1219             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1220                       p1_filter16, p0_filter16, p1_filter16);
1221             p0_filter16 = __lsx_vbitsel_v(p6, p0_filter16, flat2);
1222             p1_filter16 = __lsx_vbitsel_v(p5, p1_filter16, flat2);
1223             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1224             dst += stride;
1225             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1226             dst += stride;
1227 
1228             /* calculation of p4 and p3 */
1229             tmp0 = __lsx_vsub_h(p4_l, p5_l);
1230             tmp0 = __lsx_vadd_h(tmp0, q2_l);
1231             tmp0 = __lsx_vsub_h(tmp0, p7_l);
1232             tmp2 = __lsx_vsub_h(p3_l, p4_l);
1233             tmp2 = __lsx_vadd_h(tmp2, q3_l);
1234             tmp2 = __lsx_vsub_h(tmp2, p7_l);
1235             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1236             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1237             tmp1 = __lsx_vadd_h(tmp1, tmp2);
1238             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1239             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1240                       p1_filter16, p0_filter16, p1_filter16);
1241             p0_filter16 = __lsx_vbitsel_v(p4, p0_filter16, flat2);
1242             p1_filter16 = __lsx_vbitsel_v(p3, p1_filter16, flat2);
1243             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1244             dst += stride;
1245             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1246             dst += stride;
1247 
1248             /* calculation of p2 and p1 */
1249             tmp0 = __lsx_vsub_h(p2_l, p3_l);
1250             tmp0 = __lsx_vadd_h(tmp0, q4_l);
1251             tmp0 = __lsx_vsub_h(tmp0, p7_l);
1252             tmp2 = __lsx_vsub_h(p1_l, p2_l);
1253             tmp2 = __lsx_vadd_h(tmp2, q5_l);
1254             tmp2 = __lsx_vsub_h(tmp2, p7_l);
1255             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1256             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1257             tmp1 = __lsx_vadd_h(tmp1, tmp2);
1258             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1259             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1260                       p1_filter16, p0_filter16, p1_filter16);
1261             p0_filter16 = __lsx_vbitsel_v(p2_out, p0_filter16, flat2);
1262             p1_filter16 = __lsx_vbitsel_v(p1_out, p1_filter16, flat2);
1263             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1264             dst += stride;
1265             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1266             dst += stride;
1267 
1268             /* calculation of p0 and q0 */
1269             tmp0 = __lsx_vsub_h(p0_l, p1_l);
1270             tmp0 = __lsx_vadd_h(tmp0, q6_l);
1271             tmp0 = __lsx_vsub_h(tmp0, p7_l);
1272             tmp2 = __lsx_vsub_h(q7_l, p0_l);
1273             tmp2 = __lsx_vadd_h(tmp2, q0_l);
1274             tmp2 = __lsx_vsub_h(tmp2, p7_l);
1275             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1276             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1277             tmp1 = __lsx_vadd_h(tmp1, tmp2);
1278             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1279             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1280                       p1_filter16, p0_filter16, p1_filter16);
1281             p0_filter16 = __lsx_vbitsel_v(p0_out, p0_filter16, flat2);
1282             p1_filter16 = __lsx_vbitsel_v(q0_out, p1_filter16, flat2);
1283             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1284             dst += stride;
1285             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1286             dst += stride;
1287 
1288             /* calculation of q1 and q2 */
1289             tmp0 = __lsx_vsub_h(q7_l, q0_l);
1290             tmp0 = __lsx_vadd_h(tmp0, q1_l);
1291             tmp0 = __lsx_vsub_h(tmp0, p6_l);
1292             tmp2 = __lsx_vsub_h(q7_l, q1_l);
1293             tmp2 = __lsx_vadd_h(tmp2, q2_l);
1294             tmp2 = __lsx_vsub_h(tmp2, p5_l);
1295             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1296             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1297             tmp1 = __lsx_vadd_h(tmp1, tmp2);
1298             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1299             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1300                       p1_filter16, p0_filter16, p1_filter16);
1301             p0_filter16 = __lsx_vbitsel_v(q1_out, p0_filter16, flat2);
1302             p1_filter16 = __lsx_vbitsel_v(q2_out, p1_filter16, flat2);
1303             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1304             dst += stride;
1305             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1306             dst += stride;
1307 
1308             /* calculation of q3 and q4 */
1309             tmp0 = __lsx_vsub_h(q7_l, q2_l);
1310             tmp0 = __lsx_vadd_h(tmp0, q3_l);
1311             tmp0 = __lsx_vsub_h(tmp0, p4_l);
1312             tmp2 = __lsx_vsub_h(q7_l, q3_l);
1313             tmp2 = __lsx_vadd_h(tmp2, q4_l);
1314             tmp2 = __lsx_vsub_h(tmp2, p3_l);
1315             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1316             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1317             tmp1 = __lsx_vadd_h(tmp1, tmp2);
1318             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1319             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1320                       p1_filter16, p0_filter16, p1_filter16);
1321             p0_filter16 = __lsx_vbitsel_v(q3, p0_filter16, flat2);
1322             p1_filter16 = __lsx_vbitsel_v(q4, p1_filter16, flat2);
1323             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1324             dst += stride;
1325             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1326             dst += stride;
1327 
1328             /* calculation of q5 and q6 */
1329             tmp0 = __lsx_vsub_h(q7_l, q4_l);
1330             tmp0 = __lsx_vadd_h(tmp0, q5_l);
1331             tmp0 = __lsx_vsub_h(tmp0, p2_l);
1332             tmp2 = __lsx_vsub_h(q7_l, q5_l);
1333             tmp2 = __lsx_vadd_h(tmp2, q6_l);
1334             tmp2 = __lsx_vsub_h(tmp2, p1_l);
1335             tmp1 = __lsx_vadd_h(tmp1, tmp0);
1336             p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1337             tmp1 = __lsx_vadd_h(tmp1, tmp2);
1338             p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1339             DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1340                       p1_filter16, p0_filter16, p1_filter16);
1341             p0_filter16 = __lsx_vbitsel_v(q5, p0_filter16, flat2);
1342             p1_filter16 = __lsx_vbitsel_v(q6, p1_filter16, flat2);
1343             __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1344             dst += stride;
1345             __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1346         }
1347     }
1348 }
1349 
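/*
 * 4-tap loop filter across a vertical edge, 8 rows: 8 bytes per row are
 * loaded starting at dst - 4, the 8x8 block is transposed so p3..q3 become
 * row vectors, the filter4 path is applied, and the two modified pixels on
 * each side of the edge are interleaved back and stored as 4 bytes per row.
 */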
1350 void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
1351                               int32_t b_limit_ptr,
1352                               int32_t limit_ptr,
1353                               int32_t thresh_ptr)
1354 {
1355     ptrdiff_t stride2 = stride << 1;
1356     ptrdiff_t stride3 = stride2 + stride;
1357     ptrdiff_t stride4 = stride2 << 1;
1358     uint8_t *dst_tmp1 = dst - 4;
1359     uint8_t *dst_tmp2 = dst_tmp1 + stride4;
1360     __m128i mask, hev, flat, limit, thresh, b_limit;
1361     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1362     __m128i vec0, vec1, vec2, vec3;
1363 
1364     p3 = __lsx_vld(dst_tmp1, 0);
1365     DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, p2, p1);
1366     p0 = __lsx_vldx(dst_tmp1, stride3);
1367     q0 = __lsx_vld(dst_tmp2, 0);
1368     DUP2_ARG2(__lsx_vldx, dst_tmp2, stride, dst_tmp2, stride2, q1, q2);
1369     q3 = __lsx_vldx(dst_tmp2, stride3);
1370 
1371     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
1372     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1373     limit   = __lsx_vreplgr2vr_b(limit_ptr);
1374 
1375     LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1376                        p3, p2, p1, p0, q0, q1, q2, q3);
1377     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1378                  hev, mask, flat);
1379     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1380     DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
1381     vec2 = __lsx_vilvl_h(vec1, vec0);
1382     vec3 = __lsx_vilvh_h(vec1, vec0);
1383 
1384     dst -= 2;
1385     __lsx_vstelm_w(vec2, dst, 0, 0);
1386     __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1387     __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1388     __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1389     dst += stride4;
1390     __lsx_vstelm_w(vec3, dst, 0, 0);
1391     __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1392     __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1393     __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1394 }
1395 
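/*
 * Same 4-tap vertical-edge filter over 16 rows: the two 8-row halves use
 * independent limits packed into bits 0-7 and 8-15 of b_limit_ptr,
 * limit_ptr and thresh_ptr, which are splatted and interleaved into one
 * 16-byte vector per parameter.
 */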
1396 void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
1397                                 int32_t b_limit_ptr,
1398                                 int32_t limit_ptr,
1399                                 int32_t thresh_ptr)
1400 {
1401     ptrdiff_t stride2 = stride << 1;
1402     ptrdiff_t stride3 = stride2 + stride;
1403     ptrdiff_t stride4 = stride2 << 1;
1404     uint8_t *dst_tmp = dst - 4;
1405     __m128i mask, hev, flat;
1406     __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1407     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1408     __m128i row0, row1, row2, row3, row4, row5, row6, row7;
1409     __m128i row8, row9, row10, row11, row12, row13, row14, row15;
1410     __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1411 
1412     row0 = __lsx_vld(dst_tmp, 0);
1413     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row1, row2);
1414     row3 = __lsx_vldx(dst_tmp, stride3);
1415     dst_tmp += stride4;
1416     row4 = __lsx_vld(dst_tmp, 0);
1417     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1418     row7 = __lsx_vldx(dst_tmp, stride3);
1419     dst_tmp += stride4;
1420     row8 = __lsx_vld(dst_tmp, 0);
1421     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row9, row10);
1422     row11 = __lsx_vldx(dst_tmp, stride3);
1423     dst_tmp += stride4;
1424     row12 = __lsx_vld(dst_tmp, 0);
1425     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1426     row15 = __lsx_vldx(dst_tmp, stride3);
1427 
1428     LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
1429                         row8, row9, row10, row11, row12, row13, row14, row15,
1430                         p3, p2, p1, p0, q0, q1, q2, q3);
1431 
1432     thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
1433     thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1434     thresh0 = __lsx_vilvl_d(thresh1, thresh0);
1435 
1436     b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
1437     b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1438     b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
1439 
1440     limit0 = __lsx_vreplgr2vr_b(limit_ptr);
1441     limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1442     limit0 = __lsx_vilvl_d(limit1, limit0);
1443 
1444     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1445                  hev, mask, flat);
1446     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1447     DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
1448     tmp2 = __lsx_vilvl_h(tmp1, tmp0);
1449     tmp3 = __lsx_vilvh_h(tmp1, tmp0);
1450     DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
1451     tmp4 = __lsx_vilvl_h(tmp1, tmp0);
1452     tmp5 = __lsx_vilvh_h(tmp1, tmp0);
1453 
1454     dst -= 2;
1455     __lsx_vstelm_w(tmp2, dst, 0, 0);
1456     __lsx_vstelm_w(tmp2, dst + stride, 0, 1);
1457     __lsx_vstelm_w(tmp2, dst + stride2, 0, 2);
1458     __lsx_vstelm_w(tmp2, dst + stride3, 0, 3);
1459     dst += stride4;
1460     __lsx_vstelm_w(tmp3, dst, 0, 0);
1461     __lsx_vstelm_w(tmp3, dst + stride, 0, 1);
1462     __lsx_vstelm_w(tmp3, dst + stride2, 0, 2);
1463     __lsx_vstelm_w(tmp3, dst + stride3, 0, 3);
1464     dst += stride4;
1465     __lsx_vstelm_w(tmp4, dst, 0, 0);
1466     __lsx_vstelm_w(tmp4, dst + stride, 0, 1);
1467     __lsx_vstelm_w(tmp4, dst + stride2, 0, 2);
1468     __lsx_vstelm_w(tmp4, dst + stride3, 0, 3);
1469     dst += stride4;
1470     __lsx_vstelm_w(tmp5, dst, 0, 0);
1471     __lsx_vstelm_w(tmp5, dst + stride, 0, 1);
1472     __lsx_vstelm_w(tmp5, dst + stride2, 0, 2);
1473     __lsx_vstelm_w(tmp5, dst + stride3, 0, 3);
1474 }
1475 
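/*
 * Vertical-edge filter over 8 rows with the flat/filter8 path: when the
 * flat mask is zero only the filter4 result (4 bytes per row) is stored;
 * otherwise filter8 is evaluated on the zero-extended pixels and 6 bytes
 * per row (p2..q2) are written back.
 */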
1476 void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
1477                               int32_t b_limit_ptr,
1478                               int32_t limit_ptr,
1479                               int32_t thresh_ptr)
1480 {
1481     ptrdiff_t stride2 = stride << 1;
1482     ptrdiff_t stride3 = stride2 + stride;
1483     ptrdiff_t stride4 = stride2 << 1;
1484     uint8_t *dst_tmp = dst - 4;
1485     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1486     __m128i p1_out, p0_out, q0_out, q1_out;
1487     __m128i flat, mask, hev, thresh, b_limit, limit;
1488     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1489     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1490     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1491     __m128i vec0, vec1, vec2, vec3, vec4;
1492     __m128i zero = __lsx_vldi(0);
1493 
1494     /* load vector elements */
1495     p3 = __lsx_vld(dst_tmp, 0);
1496     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
1497     p0 = __lsx_vldx(dst_tmp, stride3);
1498     dst_tmp += stride4;
1499     q0 = __lsx_vld(dst_tmp, 0);
1500     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
1501     q3 = __lsx_vldx(dst_tmp, stride3);
1502 
1503     LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1504                        p3, p2, p1, p0, q0, q1, q2, q3);
1505 
1506     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
1507     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1508     limit   = __lsx_vreplgr2vr_b(limit_ptr);
1509 
1510     /* mask and hev */
1511     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1512                  hev, mask, flat);
1513     /* flat4 */
1514     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1515     /* filter4 */
1516     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1517                        q1_out);
1518 
1519     flat = __lsx_vilvl_d(zero, flat);
1520 
1521     /* if flat is zero for all pixels, there is no need to apply the other filters */
1522     if (__lsx_bz_v(flat)) {
1523         /* Store 4 pixels p1 - q1 */
1524         DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1525         vec2 = __lsx_vilvl_h(vec1, vec0);
1526         vec3 = __lsx_vilvh_h(vec1, vec0);
1527 
1528         dst -= 2;
1529         __lsx_vstelm_w(vec2, dst, 0, 0);
1530         __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1531         __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1532         __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1533         dst += stride4;
1534         __lsx_vstelm_w(vec3, dst, 0, 0);
1535         __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1536         __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1537         __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1538     } else {
1539         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1540                   p3_l, p2_l, p1_l, p0_l);
1541         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1542                   q0_l, q1_l, q2_l, q3_l);
1543         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1544                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1545         /* convert 16 bit output data into 8 bit */
1546         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
1547                   p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l,
1548                   q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
1549                   q0_filt8_l);
1550         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
1551                   q2_filt8_l, q1_filt8_l, q2_filt8_l);
1552 
1553         /* store pixel values */
1554         p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1555         p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1556         p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1557         q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1558         q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1559         q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1560 
1561         /* Store 6 pixels p2 - q2 */
1562         DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1563         vec2 = __lsx_vilvl_h(vec1, vec0);
1564         vec3 = __lsx_vilvh_h(vec1, vec0);
1565         vec4 = __lsx_vilvl_b(q2, q1);
1566 
1567         dst -= 3;
1568         __lsx_vstelm_w(vec2, dst, 0, 0);
1569         __lsx_vstelm_h(vec4, dst, 4, 0);
1570         dst += stride;
1571         __lsx_vstelm_w(vec2, dst, 0, 1);
1572         __lsx_vstelm_h(vec4, dst, 4, 1);
1573         dst += stride;
1574         __lsx_vstelm_w(vec2, dst, 0, 2);
1575         __lsx_vstelm_h(vec4, dst, 4, 2);
1576         dst += stride;
1577         __lsx_vstelm_w(vec2, dst, 0, 3);
1578         __lsx_vstelm_h(vec4, dst, 4, 3);
1579         dst += stride;
1580         __lsx_vstelm_w(vec3, dst, 0, 0);
1581         __lsx_vstelm_h(vec4, dst, 4, 4);
1582         dst += stride;
1583         __lsx_vstelm_w(vec3, dst, 0, 1);
1584         __lsx_vstelm_h(vec4, dst, 4, 5);
1585         dst += stride;
1586         __lsx_vstelm_w(vec3, dst, 0, 2);
1587         __lsx_vstelm_h(vec4, dst, 4, 6);
1588         dst += stride;
1589         __lsx_vstelm_w(vec3, dst, 0, 3);
1590         __lsx_vstelm_h(vec4, dst, 4, 7);
1591     }
1592 }
1593 
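/*
 * 16-row vertical-edge filter where both 8-row halves may take the
 * flat/filter8 path; per-half limits are packed into the low and high
 * bytes of the threshold arguments, as in the 44 variant above.
 */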
1594 void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
1595                                 int32_t b_limit_ptr,
1596                                 int32_t limit_ptr,
1597                                 int32_t thresh_ptr)
1598 {
1599     ptrdiff_t stride2 = stride << 1;
1600     ptrdiff_t stride3 = stride2 + stride;
1601     ptrdiff_t stride4 = stride2 << 1;
1602     uint8_t *dst_tmp = dst - 4;
1603     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1604     __m128i p1_out, p0_out, q0_out, q1_out;
1605     __m128i flat, mask, hev, thresh, b_limit, limit;
1606     __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1607     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1608     __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1609     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1610     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1611     __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1612     __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1613     __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1614     __m128i zero = __lsx_vldi(0);
1615 
1616     p0 = __lsx_vld(dst_tmp, 0);
1617     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1618     p3 = __lsx_vldx(dst_tmp, stride3);
1619     dst_tmp += stride4;
1620     row4 = __lsx_vld(dst_tmp, 0);
1621     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1622     row7 = __lsx_vldx(dst_tmp, stride3);
1623     dst_tmp += stride4;
1624     q3 = __lsx_vld(dst_tmp, 0);
1625     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1626     q0 = __lsx_vldx(dst_tmp, stride3);
1627     dst_tmp += stride4;
1628     row12 = __lsx_vld(dst_tmp, 0);
1629     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1630     row15 = __lsx_vldx(dst_tmp, stride3);
1631 
1632     /* transpose 16x8 matrix into 8x16 */
1633     LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1634                         q3, q2, q1, q0, row12, row13, row14, row15,
1635                         p3, p2, p1, p0, q0, q1, q2, q3);
1636 
1637     thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1638     vec0   = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1639     thresh = __lsx_vilvl_d(vec0, thresh);
1640 
1641     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1642     vec0    = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1643     b_limit = __lsx_vilvl_d(vec0, b_limit);
1644 
1645     limit = __lsx_vreplgr2vr_b(limit_ptr);
1646     vec0  = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1647     limit = __lsx_vilvl_d(vec0, limit);
1648 
1649     /* mask and hev */
1650     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1651                  hev, mask, flat);
1652     /* flat4 */
1653     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1654     /* filter4 */
1655     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1656                        q1_out);
1657 
1658     /* if flat is zero for all pixels, there is no need to apply the other filters */
1659     if (__lsx_bz_v(flat)) {
1660         DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1661         vec2 = __lsx_vilvl_h(vec1, vec0);
1662         vec3 = __lsx_vilvh_h(vec1, vec0);
1663         DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1664         vec4 = __lsx_vilvl_h(vec1, vec0);
1665         vec5 = __lsx_vilvh_h(vec1, vec0);
1666 
1667         dst -= 2;
1668         __lsx_vstelm_w(vec2, dst, 0, 0);
1669         __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1670         __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1671         __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1672         dst += stride4;
1673         __lsx_vstelm_w(vec3, dst, 0, 0);
1674         __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1675         __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1676         __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1677         dst += stride4;
1678         __lsx_vstelm_w(vec4, dst, 0, 0);
1679         __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1680         __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1681         __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1682         dst += stride4;
1683         __lsx_vstelm_w(vec5, dst, 0, 0);
1684         __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1685         __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1686         __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1687     } else {
1688         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1689                   p3_l, p2_l, p1_l, p0_l);
1690         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1691                   q0_l, q1_l, q2_l, q3_l);
1692         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1693                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1694 
1695         DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
1696                   p3_h, p2_h, p1_h, p0_h);
1697         DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
1698                   q0_h, q1_h, q2_h, q3_h);
1699 
1700         /* filter8 */
1701         VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
1702                     p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
1703 
1704         /* convert 16 bit output data into 8 bit */
1705         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
1706                   p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
1707                   p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1708         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
1709                   q2_filt8_l, q1_filt8_l, q2_filt8_l);
1710 
1711         /* store pixel values */
1712         p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1713         p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1714         p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1715         q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1716         q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1717         q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1718 
1719         DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1720         vec3 = __lsx_vilvl_h(vec1, vec0);
1721         vec4 = __lsx_vilvh_h(vec1, vec0);
1722         DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1723         vec6 = __lsx_vilvl_h(vec1, vec0);
1724         vec7 = __lsx_vilvh_h(vec1, vec0);
1725         vec2 = __lsx_vilvl_b(q2, q1);
1726         vec5 = __lsx_vilvh_b(q2, q1);
1727 
1728         dst -= 3;
1729         __lsx_vstelm_w(vec3, dst, 0, 0);
1730         __lsx_vstelm_h(vec2, dst, 4, 0);
1731         dst += stride;
1732         __lsx_vstelm_w(vec3, dst, 0, 1);
1733         __lsx_vstelm_h(vec2, dst, 4, 1);
1734         dst += stride;
1735         __lsx_vstelm_w(vec3, dst, 0, 2);
1736         __lsx_vstelm_h(vec2, dst, 4, 2);
1737         dst += stride;
1738         __lsx_vstelm_w(vec3, dst, 0, 3);
1739         __lsx_vstelm_h(vec2, dst, 4, 3);
1740         dst += stride;
1741         __lsx_vstelm_w(vec4, dst, 0, 0);
1742         __lsx_vstelm_h(vec2, dst, 4, 4);
1743         dst += stride;
1744         __lsx_vstelm_w(vec4, dst, 0, 1);
1745         __lsx_vstelm_h(vec2, dst, 4, 5);
1746         dst += stride;
1747         __lsx_vstelm_w(vec4, dst, 0, 2);
1748         __lsx_vstelm_h(vec2, dst, 4, 6);
1749         dst += stride;
1750         __lsx_vstelm_w(vec4, dst, 0, 3);
1751         __lsx_vstelm_h(vec2, dst, 4, 7);
1752         dst += stride;
1753         __lsx_vstelm_w(vec6, dst, 0, 0);
1754         __lsx_vstelm_h(vec5, dst, 4, 0);
1755         dst += stride;
1756         __lsx_vstelm_w(vec6, dst, 0, 1);
1757         __lsx_vstelm_h(vec5, dst, 4, 1);
1758         dst += stride;
1759         __lsx_vstelm_w(vec6, dst, 0, 2);
1760         __lsx_vstelm_h(vec5, dst, 4, 2);
1761         dst += stride;
1762         __lsx_vstelm_w(vec6, dst, 0, 3);
1763         __lsx_vstelm_h(vec5, dst, 4, 3);
1764         dst += stride;
1765         __lsx_vstelm_w(vec7, dst, 0, 0);
1766         __lsx_vstelm_h(vec5, dst, 4, 4);
1767         dst += stride;
1768         __lsx_vstelm_w(vec7, dst, 0, 1);
1769         __lsx_vstelm_h(vec5, dst, 4, 5);
1770         dst += stride;
1771         __lsx_vstelm_w(vec7, dst, 0, 2);
1772         __lsx_vstelm_h(vec5, dst, 4, 6);
1773         dst += stride;
1774         __lsx_vstelm_w(vec7, dst, 0, 3);
1775         __lsx_vstelm_h(vec5, dst, 4, 7);
1776     }
1777 }
1778 
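/*
 * Mixed 16-row vertical-edge filter: the flat mask is confined to its low
 * half, so the flat/filter8 path can replace pixels only in the first 8
 * rows while the last 8 rows always keep the filter4 result.
 */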
1779 void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
1780                                 int32_t b_limit_ptr,
1781                                 int32_t limit_ptr,
1782                                 int32_t thresh_ptr)
1783 {
1784     ptrdiff_t stride2 = stride << 1;
1785     ptrdiff_t stride3 = stride2 + stride;
1786     ptrdiff_t stride4 = stride2 << 1;
1787     uint8_t *dst_tmp = dst - 4;
1788     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1789     __m128i p1_out, p0_out, q0_out, q1_out;
1790     __m128i flat, mask, hev, thresh, b_limit, limit;
1791     __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1792     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1793     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1794     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1795     __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1796     __m128i zero = __lsx_vldi(0);
1797 
1798     p0 = __lsx_vld(dst_tmp, 0);
1799     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1800     p3 = __lsx_vldx(dst_tmp, stride3);
1801     dst_tmp += stride4;
1802     row4 = __lsx_vld(dst_tmp, 0);
1803     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1804     row7 = __lsx_vldx(dst_tmp, stride3);
1805     dst_tmp += stride4;
1806     q3 = __lsx_vld(dst_tmp, 0);
1807     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1808     q0 = __lsx_vldx(dst_tmp, stride3);
1809     dst_tmp += stride4;
1810     row12 = __lsx_vld(dst_tmp, 0);
1811     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1812     row15 = __lsx_vldx(dst_tmp, stride3);
1813 
1814     /* transpose 16x8 matrix into 8x16 */
1815     LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1816                         q3, q2, q1, q0, row12, row13, row14, row15,
1817                         p3, p2, p1, p0, q0, q1, q2, q3);
1818 
1819     thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1820     vec0   = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1821     thresh = __lsx_vilvl_d(vec0, thresh);
1822 
1823     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1824     vec0    = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1825     b_limit = __lsx_vilvl_d(vec0, b_limit);
1826 
1827     limit = __lsx_vreplgr2vr_b(limit_ptr);
1828     vec0  = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1829     limit = __lsx_vilvl_d(vec0, limit);
1830 
1831     /* mask and hev */
1832     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1833                  hev, mask, flat);
1834     /* flat4 */
1835     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1836     /* filter4 */
1837     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1838                        q1_out);
1839 
1840     flat = __lsx_vilvl_d(zero, flat);
1841 
1842     /* if flat is zero for all pixels, there is no need to apply the other filters */
1843     if (__lsx_bz_v(flat)) {
1844         DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1845         vec2 = __lsx_vilvl_h(vec1, vec0);
1846         vec3 = __lsx_vilvh_h(vec1, vec0);
1847         DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1848         vec4 = __lsx_vilvl_h(vec1, vec0);
1849         vec5 = __lsx_vilvh_h(vec1, vec0);
1850 
1851         dst -= 2;
1852         __lsx_vstelm_w(vec2, dst, 0, 0);
1853         __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1854         __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1855         __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1856         dst += stride4;
1857         __lsx_vstelm_w(vec3, dst, 0, 0);
1858         __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1859         __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1860         __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1861         dst += stride4;
1862         __lsx_vstelm_w(vec4, dst, 0, 0);
1863         __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1864         __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1865         __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1866         dst += stride4;
1867         __lsx_vstelm_w(vec5, dst, 0, 0);
1868         __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1869         __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1870         __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1871     } else {
1872         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1873                   p3_l, p2_l, p1_l, p0_l);
1874         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1875                   q0_l, q1_l, q2_l, q3_l);
1876         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1877                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1878 
1879         /* convert 16 bit output data into 8 bit */
1880         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1881                   p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
1882                   p1_filt8_l, p0_filt8_l, q0_filt8_l);
1883         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1884                   q1_filt8_l, q2_filt8_l);
1885 
1886         /* store pixel values */
1887         p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1888         p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1889         p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1890         q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1891         q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1892         q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1893 
1894         DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1895         vec3 = __lsx_vilvl_h(vec1, vec0);
1896         vec4 = __lsx_vilvh_h(vec1, vec0);
1897         DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1898         vec6 = __lsx_vilvl_h(vec1, vec0);
1899         vec7 = __lsx_vilvh_h(vec1, vec0);
1900         vec2 = __lsx_vilvl_b(q2, q1);
1901         vec5 = __lsx_vilvh_b(q2, q1);
1902 
1903         dst -= 3;
1904         __lsx_vstelm_w(vec3, dst, 0, 0);
1905         __lsx_vstelm_h(vec2, dst, 4, 0);
1906         dst += stride;
1907         __lsx_vstelm_w(vec3, dst, 0, 1);
1908         __lsx_vstelm_h(vec2, dst, 4, 1);
1909         dst += stride;
1910         __lsx_vstelm_w(vec3, dst, 0, 2);
1911         __lsx_vstelm_h(vec2, dst, 4, 2);
1912         dst += stride;
1913         __lsx_vstelm_w(vec3, dst, 0, 3);
1914         __lsx_vstelm_h(vec2, dst, 4, 3);
1915         dst += stride;
1916         __lsx_vstelm_w(vec4, dst, 0, 0);
1917         __lsx_vstelm_h(vec2, dst, 4, 4);
1918         dst += stride;
1919         __lsx_vstelm_w(vec4, dst, 0, 1);
1920         __lsx_vstelm_h(vec2, dst, 4, 5);
1921         dst += stride;
1922         __lsx_vstelm_w(vec4, dst, 0, 2);
1923         __lsx_vstelm_h(vec2, dst, 4, 6);
1924         dst += stride;
1925         __lsx_vstelm_w(vec4, dst, 0, 3);
1926         __lsx_vstelm_h(vec2, dst, 4, 7);
1927         dst += stride;
1928         __lsx_vstelm_w(vec6, dst, 0, 0);
1929         __lsx_vstelm_h(vec5, dst, 4, 0);
1930         dst += stride;
1931         __lsx_vstelm_w(vec6, dst, 0, 1);
1932         __lsx_vstelm_h(vec5, dst, 4, 1);
1933         dst += stride;
1934         __lsx_vstelm_w(vec6, dst, 0, 2);
1935         __lsx_vstelm_h(vec5, dst, 4, 2);
1936         dst += stride;
1937         __lsx_vstelm_w(vec6, dst, 0, 3);
1938         __lsx_vstelm_h(vec5, dst, 4, 3);
1939         dst += stride;
1940         __lsx_vstelm_w(vec7, dst, 0, 0);
1941         __lsx_vstelm_h(vec5, dst, 4, 4);
1942         dst += stride;
1943         __lsx_vstelm_w(vec7, dst, 0, 1);
1944         __lsx_vstelm_h(vec5, dst, 4, 5);
1945         dst += stride;
1946         __lsx_vstelm_w(vec7, dst, 0, 2);
1947         __lsx_vstelm_h(vec5, dst, 4, 6);
1948         dst += stride;
1949         __lsx_vstelm_w(vec7, dst, 0, 3);
1950         __lsx_vstelm_h(vec5, dst, 4, 7);
1951     }
1952 }
1953 
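/*
 * Mirror of the 84 variant: the flat mask is confined to its high half and
 * filter8 operates on the high halves of the zero-extended pixels, so only
 * the last 8 rows can take the flat/filter8 path.
 */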
1954 void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
1955                                 int32_t b_limit_ptr,
1956                                 int32_t limit_ptr,
1957                                 int32_t thresh_ptr)
1958 {
1959     ptrdiff_t stride2 = stride << 1;
1960     ptrdiff_t stride3 = stride2 + stride;
1961     ptrdiff_t stride4 = stride2 << 1;
1962     uint8_t *dst_tmp = dst - 4;
1963     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1964     __m128i p1_out, p0_out, q0_out, q1_out;
1965     __m128i flat, mask, hev, thresh, b_limit, limit;
1966     __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1967     __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1968     __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1969     __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1970     __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1971     __m128i zero = __lsx_vldi(0);
1972 
1973     p0 = __lsx_vld(dst_tmp, 0);
1974     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1975     p3 = __lsx_vldx(dst_tmp, stride3);
1976     dst_tmp += stride4;
1977     row4 = __lsx_vld(dst_tmp, 0);
1978     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1979     row7 = __lsx_vldx(dst_tmp, stride3);
1980     dst_tmp += stride4;
1981     q3 = __lsx_vld(dst_tmp, 0);
1982     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1983     q0 = __lsx_vldx(dst_tmp, stride3);
1984     dst_tmp += stride4;
1985     row12 = __lsx_vld(dst_tmp, 0);
1986     DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1987     row15 = __lsx_vldx(dst_tmp, stride3);
1988 
1989     /* transpose 16x8 matrix into 8x16 */
1990     LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1991                         q3, q2, q1, q0, row12, row13, row14, row15,
1992                         p3, p2, p1, p0, q0, q1, q2, q3);
1993 
1994     thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1995     vec0   = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1996     thresh = __lsx_vilvl_d(vec0, thresh);
1997 
1998     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1999     vec0    = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
2000     b_limit = __lsx_vilvl_d(vec0, b_limit);
2001 
2002     limit = __lsx_vreplgr2vr_b(limit_ptr);
2003     vec0  = __lsx_vreplgr2vr_b(limit_ptr >> 8);
2004     limit = __lsx_vilvl_d(vec0, limit);
2005 
2006     /* mask and hev */
2007     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2008                  hev, mask, flat);
2009     /* flat4 */
2010     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2011     /* filter4 */
2012     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2013                        q1_out);
2014 
2015     flat = __lsx_vilvh_d(flat, zero);
2016 
2017     /* if flat is zero for all pixels, there is no need to apply the other filters */
2018     if (__lsx_bz_v(flat)) {
2019         DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2020         vec2 = __lsx_vilvl_h(vec1, vec0);
2021         vec3 = __lsx_vilvh_h(vec1, vec0);
2022         DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2023         vec4 = __lsx_vilvl_h(vec1, vec0);
2024         vec5 = __lsx_vilvh_h(vec1, vec0);
2025 
2026         dst -= 2;
2027         __lsx_vstelm_w(vec2, dst, 0, 0);
2028         __lsx_vstelm_w(vec2, dst + stride, 0, 1);
2029         __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
2030         __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
2031         dst += stride4;
2032         __lsx_vstelm_w(vec3, dst, 0, 0);
2033         __lsx_vstelm_w(vec3, dst + stride, 0, 1);
2034         __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
2035         __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
2036         dst += stride4;
2037         __lsx_vstelm_w(vec4, dst, 0, 0);
2038         __lsx_vstelm_w(vec4, dst + stride, 0, 1);
2039         __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
2040         __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
2041         dst += stride4;
2042         __lsx_vstelm_w(vec5, dst, 0, 0);
2043         __lsx_vstelm_w(vec5, dst + stride, 0, 1);
2044         __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
2045         __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
2046     } else {
2047         DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2048                   p3_h, p2_h, p1_h, p0_h);
2049         DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2050                   q0_h, q1_h, q2_h, q3_h);
2051 
2052         VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2053                     p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2054 
2055         /* convert 16 bit output data into 8 bit */
2056         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
2057                   p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
2058                   p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
2059         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
2060                   q2_filt8_h, q1_filt8_h, q2_filt8_h);
2061 
2062         /* store pixel values */
2063         p2 = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
2064         p1 = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
2065         p0 = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
2066         q0 = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
2067         q1 = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
2068         q2 = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
2069 
2070         DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2071         vec3 = __lsx_vilvl_h(vec1, vec0);
2072         vec4 = __lsx_vilvh_h(vec1, vec0);
2073         DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2074         vec6 = __lsx_vilvl_h(vec1, vec0);
2075         vec7 = __lsx_vilvh_h(vec1, vec0);
2076         vec2 = __lsx_vilvl_b(q2, q1);
2077         vec5 = __lsx_vilvh_b(q2, q1);
2078 
2079         dst -= 3;
2080         __lsx_vstelm_w(vec3, dst, 0, 0);
2081         __lsx_vstelm_h(vec2, dst, 4, 0);
2082         dst += stride;
2083         __lsx_vstelm_w(vec3, dst, 0, 1);
2084         __lsx_vstelm_h(vec2, dst, 4, 1);
2085         dst += stride;
2086         __lsx_vstelm_w(vec3, dst, 0, 2);
2087         __lsx_vstelm_h(vec2, dst, 4, 2);
2088         dst += stride;
2089         __lsx_vstelm_w(vec3, dst, 0, 3);
2090         __lsx_vstelm_h(vec2, dst, 4, 3);
2091         dst += stride;
2092         __lsx_vstelm_w(vec4, dst, 0, 0);
2093         __lsx_vstelm_h(vec2, dst, 4, 4);
2094         dst += stride;
2095         __lsx_vstelm_w(vec4, dst, 0, 1);
2096         __lsx_vstelm_h(vec2, dst, 4, 5);
2097         dst += stride;
2098         __lsx_vstelm_w(vec4, dst, 0, 2);
2099         __lsx_vstelm_h(vec2, dst, 4, 6);
2100         dst += stride;
2101         __lsx_vstelm_w(vec4, dst, 0, 3);
2102         __lsx_vstelm_h(vec2, dst, 4, 7);
2103         dst += stride;
2104         __lsx_vstelm_w(vec6, dst, 0, 0);
2105         __lsx_vstelm_h(vec5, dst, 4, 0);
2106         dst += stride;
2107         __lsx_vstelm_w(vec6, dst, 0, 1);
2108         __lsx_vstelm_h(vec5, dst, 4, 1);
2109         dst += stride;
2110         __lsx_vstelm_w(vec6, dst, 0, 2);
2111         __lsx_vstelm_h(vec5, dst, 4, 2);
2112         dst += stride;
2113         __lsx_vstelm_w(vec6, dst, 0, 3);
2114         __lsx_vstelm_h(vec5, dst, 4, 3);
2115         dst += stride;
2116         __lsx_vstelm_w(vec7, dst, 0, 0);
2117         __lsx_vstelm_h(vec5, dst, 4, 4);
2118         dst += stride;
2119         __lsx_vstelm_w(vec7, dst, 0, 1);
2120         __lsx_vstelm_h(vec5, dst, 4, 5);
2121         dst += stride;
2122         __lsx_vstelm_w(vec7, dst, 0, 2);
2123         __lsx_vstelm_h(vec5, dst, 4, 6);
2124         dst += stride;
2125         __lsx_vstelm_w(vec7, dst, 0, 3);
2126         __lsx_vstelm_h(vec5, dst, 4, 7);
2127     }
2128 }
2129 
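/*
 * Transpose an 8-row x 16-column block from input (stride in_pitch) so that
 * each of the 16 columns becomes an 8-pixel row stored 16 bytes apart in
 * output; this column-major copy is what the vertical-edge helpers below
 * operate on.
 */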
2130 static void vp9_transpose_16x8_to_8x16(uint8_t *input, ptrdiff_t in_pitch,
2131                                        uint8_t *output)
2132 {
2133     __m128i p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
2134     __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2135     __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2136     ptrdiff_t in_pitch2 = in_pitch << 1;
2137     ptrdiff_t in_pitch3 = in_pitch2 + in_pitch;
2138     ptrdiff_t in_pitch4 = in_pitch2 << 1;
2139 
2140     LSX_LD_8(input, in_pitch, in_pitch2, in_pitch3, in_pitch4,
2141              p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
2142     /* 8x8 transpose */
2143     LSX_TRANSPOSE8x8_B(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
2144                        p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
2145     /* 8x8 transpose */
2146     DUP4_ARG2(__lsx_vilvh_b, p5_org, p7_org, p4_org, p6_org, p1_org,
2147               p3_org, p0_org, p2_org, tmp0, tmp1, tmp2, tmp3);
2148     DUP2_ARG2(__lsx_vilvl_b, tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
2149     DUP2_ARG2(__lsx_vilvh_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
2150     DUP2_ARG2(__lsx_vilvl_w, tmp6, tmp4, tmp7, tmp5, q0, q4);
2151     DUP2_ARG2(__lsx_vilvh_w, tmp6, tmp4, tmp7, tmp5, q2, q6);
2152     DUP4_ARG2(__lsx_vbsrl_v, q0, 8, q2, 8, q4, 8, q6, 8, q1, q3, q5, q7);
2153 
2154     __lsx_vst(p7, output, 0);
2155     __lsx_vst(p6, output, 16);
2156     __lsx_vst(p5, output, 32);
2157     __lsx_vst(p4, output, 48);
2158     __lsx_vst(p3, output, 64);
2159     __lsx_vst(p2, output, 80);
2160     __lsx_vst(p1, output, 96);
2161     __lsx_vst(p0, output, 112);
2162     __lsx_vst(q0, output, 128);
2163     __lsx_vst(q1, output, 144);
2164     __lsx_vst(q2, output, 160);
2165     __lsx_vst(q3, output, 176);
2166     __lsx_vst(q4, output, 192);
2167     __lsx_vst(q5, output, 208);
2168     __lsx_vst(q6, output, 224);
2169     __lsx_vst(q7, output, 240);
2170 }
2171 
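/*
 * Inverse of the transpose above: the 16 rows of 8 pixels held in the temp
 * buffer (16-byte stride) are transposed back into 8 rows of 16 pixels and
 * stored to output with out_pitch.
 */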
2172 static void vp9_transpose_8x16_to_16x8(uint8_t *input, uint8_t *output,
2173                                        ptrdiff_t out_pitch)
2174 {
2175     __m128i p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
2176     __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2177     ptrdiff_t out_pitch2 = out_pitch << 1;
2178     ptrdiff_t out_pitch3 = out_pitch2 + out_pitch;
2179     ptrdiff_t out_pitch4 = out_pitch2 << 1;
2180 
2181     DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
2182               p7, p6, p5, p4);
2183     DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
2184               p3, p2, p1, p0);
2185     DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176,
2186               q0, q1, q2, q3);
2187     DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240,
2188               q4, q5, q6, q7);
2189     LSX_TRANSPOSE16x8_B(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
2190                         q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
2191     LSX_ST_8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o,
2192              output, out_pitch, out_pitch2, out_pitch3, out_pitch4);
2193 }
2194 
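/*
 * Transpose a full 16x16 block: the first 8 output rows come directly from
 * LSX_TRANSPOSE16x8_B, the remaining 8 are rebuilt from the odd doublewords
 * of the loaded rows with even/odd byte, halfword and word packs.
 */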
2195 static void vp9_transpose_16x16(uint8_t *input, int32_t in_stride,
2196                                 uint8_t *output, int32_t out_stride)
2197 {
2198     __m128i row0, row1, row2, row3, row4, row5, row6, row7;
2199     __m128i row8, row9, row10, row11, row12, row13, row14, row15;
2200     __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
2201     __m128i tmp2, tmp3;
2202     __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2203     int32_t in_stride2 = in_stride << 1;
2204     int32_t in_stride3 = in_stride2 + in_stride;
2205     int32_t in_stride4 = in_stride2 << 1;
2206     int32_t out_stride2 = out_stride << 1;
2207     int32_t out_stride3 = out_stride2 + out_stride;
2208     int32_t out_stride4 = out_stride2 << 1;
2209 
2210     LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2211              row0, row1, row2, row3, row4, row5, row6, row7);
2212     input += in_stride4;
2213     LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2214              row8, row9, row10, row11, row12, row13, row14, row15);
2215 
2216     LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
2217                         row8, row9, row10, row11, row12, row13, row14, row15,
2218                         p7, p6, p5, p4, p3, p2, p1, p0);
2219 
2220     /* transpose 16x8 matrix into 8x16 */
2221     /* total of 8 intermediate registers and 32 instructions */
2222     q7 = __lsx_vpackod_d(row8, row0);
2223     q6 = __lsx_vpackod_d(row9, row1);
2224     q5 = __lsx_vpackod_d(row10, row2);
2225     q4 = __lsx_vpackod_d(row11, row3);
2226     q3 = __lsx_vpackod_d(row12, row4);
2227     q2 = __lsx_vpackod_d(row13, row5);
2228     q1 = __lsx_vpackod_d(row14, row6);
2229     q0 = __lsx_vpackod_d(row15, row7);
2230 
2231     DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
2232     DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
2233 
2234     DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
2235     DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
2236 
2237     DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
2238     q0 = __lsx_vpackev_w(tmp3, tmp2);
2239     q4 = __lsx_vpackod_w(tmp3, tmp2);
2240 
2241     tmp2 = __lsx_vpackod_h(tmp1, tmp0);
2242     tmp3 = __lsx_vpackod_h(q7, q5);
2243     q2 = __lsx_vpackev_w(tmp3, tmp2);
2244     q6 = __lsx_vpackod_w(tmp3, tmp2);
2245 
2246     DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
2247     q1 = __lsx_vpackev_w(tmp3, tmp2);
2248     q5 = __lsx_vpackod_w(tmp3, tmp2);
2249 
2250     tmp2 = __lsx_vpackod_h(tmp5, tmp4);
2251     tmp3 = __lsx_vpackod_h(tmp7, tmp6);
2252     q3 = __lsx_vpackev_w(tmp3, tmp2);
2253     q7 = __lsx_vpackod_w(tmp3, tmp2);
2254 
2255     LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride,
2256              out_stride2, out_stride3, out_stride4);
2257     output += out_stride4;
2258     LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride,
2259              out_stride2, out_stride3, out_stride4);
2260 }
2261 
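/*
 * filter4/filter8 stage for an 8-row vertical edge held in the transposed
 * temp buffer (16-byte stride). Returns 1 when the flat mask is zero, in
 * which case the filter4 result has already been interleaved and stored to
 * src_org; otherwise the filter8 output and the flat mask are saved in
 * filter48 for the filter16 stage and 0 is returned.
 */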
2262 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
2263                                        uint8_t *src_org, int32_t pitch_org,
2264                                        int32_t b_limit_ptr,
2265                                        int32_t limit_ptr,
2266                                        int32_t thresh_ptr)
2267 {
2268     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2269     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2270     __m128i flat, mask, hev, thresh, b_limit, limit;
2271     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2272     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2273     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2274     __m128i vec0, vec1, vec2, vec3;
2275     __m128i zero = __lsx_vldi(0);
2276 
2277     /* load vector elements */
2278     DUP4_ARG2(__lsx_vld, src, -64, src, -48, src, -32, src, -16,
2279               p3, p2, p1, p0);
2280     DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, q0, q1, q2, q3);
2281 
2282     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
2283     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2284     limit   = __lsx_vreplgr2vr_b(limit_ptr);
2285 
2286     /* mask and hev */
2287     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2288                  hev, mask, flat);
2289     /* flat4 */
2290     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2291     /* filter4 */
2292     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2293                        q1_out);
2294 
2295     flat = __lsx_vilvl_d(zero, flat);
2296 
2297     /* if flat is zero for all pixels, there is no need to apply the other filters */
2298     if (__lsx_bz_v(flat)) {
2299         DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2300         vec2 = __lsx_vilvl_h(vec1, vec0);
2301         vec3 = __lsx_vilvh_h(vec1, vec0);
2302 
2303         src_org -= 2;
2304         __lsx_vstelm_w(vec2, src_org, 0, 0);
2305         src_org += pitch_org;
2306         __lsx_vstelm_w(vec2, src_org, 0, 1);
2307         src_org += pitch_org;
2308         __lsx_vstelm_w(vec2, src_org, 0, 2);
2309         src_org += pitch_org;
2310         __lsx_vstelm_w(vec2, src_org, 0, 3);
2311         src_org += pitch_org;
2312         __lsx_vstelm_w(vec3, src_org, 0, 0);
2313         src_org += pitch_org;
2314         __lsx_vstelm_w(vec3, src_org, 0, 1);
2315         src_org += pitch_org;
2316         __lsx_vstelm_w(vec3, src_org, 0, 2);
2317         src_org += pitch_org;
2318         __lsx_vstelm_w(vec3, src_org, 0, 3);
2319         return 1;
2320     } else {
2321         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2322                   p3_l, p2_l, p1_l, p0_l);
2323         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2324                   q0_l, q1_l, q2_l, q3_l);
2325         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2326                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2327 
2328         /* convert 16 bit output data into 8 bit */
2329         p2_l = __lsx_vpickev_b(p2_filt8_l, p2_filt8_l);
2330         p1_l = __lsx_vpickev_b(p1_filt8_l, p1_filt8_l);
2331         p0_l = __lsx_vpickev_b(p0_filt8_l, p0_filt8_l);
2332         q0_l = __lsx_vpickev_b(q0_filt8_l, q0_filt8_l);
2333         q1_l = __lsx_vpickev_b(q1_filt8_l, q1_filt8_l);
2334         q2_l = __lsx_vpickev_b(q2_filt8_l, q2_filt8_l);
2335 
2336         /* store pixel values */
2337         p2_out = __lsx_vbitsel_v(p2, p2_l, flat);
2338         p1_out = __lsx_vbitsel_v(p1_out, p1_l, flat);
2339         p0_out = __lsx_vbitsel_v(p0_out, p0_l, flat);
2340         q0_out = __lsx_vbitsel_v(q0_out, q0_l, flat);
2341         q1_out = __lsx_vbitsel_v(q1_out, q1_l, flat);
2342         q2_out = __lsx_vbitsel_v(q2, q2_l, flat);
2343 
2344         __lsx_vst(p2_out, filter48, 0);
2345         __lsx_vst(p1_out, filter48, 16);
2346         __lsx_vst(p0_out, filter48, 32);
2347         __lsx_vst(q0_out, filter48, 48);
2348         __lsx_vst(q1_out, filter48, 64);
2349         __lsx_vst(q2_out, filter48, 80);
2350         __lsx_vst(flat, filter48, 96);
2351 
2352         return 0;
2353     }
2354 }
2355 
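/*
 * filter16 stage for the same 8-row vertical edge: when flat2 is zero the
 * filter48 results are interleaved and stored to dst_org and 1 is returned;
 * otherwise the wide (flat2) smoothing is accumulated in 16-bit lanes and
 * written back into the temp buffer one 8-pixel row at a time.
 */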
2356 static int32_t vp9_vt_lpf_t16_8w(uint8_t *dst, uint8_t *dst_org,
2357                                  ptrdiff_t stride,
2358                                  uint8_t *filter48)
2359 {
2360     __m128i zero = __lsx_vldi(0);
2361     __m128i filter8, flat, flat2;
2362     __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2363     v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2364     v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2365     v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2366     v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2367     v8u16 tmp0_l, tmp1_l;
2368     __m128i out_l;
2369     uint8_t *dst_tmp = dst - 128;
2370 
2371     /* load vector elements */
2372     DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2373               dst_tmp, 48, p7, p6, p5, p4);
2374     DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2375               dst_tmp, 112, p3, p2, p1, p0);
2376     DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2377     DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2378 
2379     flat = __lsx_vld(filter48, 96);
2380 
2381 
2382     VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2383 
2384     /* if flat2 is zero for all pixels, then no need to calculate other filter */
2385     if (__lsx_bz_v(flat2)) {
2386         __m128i vec0, vec1, vec2, vec3, vec4;
2387 
2388         DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2389                   filter48, 48, p2, p1, p0, q0);
2390         DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2391 
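             /* transpose the filter8 results back and store 6 pixels (p2..q2) per row */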
2392         DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2393         vec3 = __lsx_vilvl_h(vec1, vec0);
2394         vec4 = __lsx_vilvh_h(vec1, vec0);
2395         vec2 = __lsx_vilvl_b(q2, q1);
2396 
2397         dst_org -= 3;
2398         __lsx_vstelm_w(vec3, dst_org, 0, 0);
2399         __lsx_vstelm_h(vec2, dst_org, 4, 0);
2400         dst_org += stride;
2401         __lsx_vstelm_w(vec3, dst_org, 0, 1);
2402         __lsx_vstelm_h(vec2, dst_org, 4, 1);
2403         dst_org += stride;
2404         __lsx_vstelm_w(vec3, dst_org, 0, 2);
2405         __lsx_vstelm_h(vec2, dst_org, 4, 2);
2406         dst_org += stride;
2407         __lsx_vstelm_w(vec3, dst_org, 0, 3);
2408         __lsx_vstelm_h(vec2, dst_org, 4, 3);
2409         dst_org += stride;
2410         __lsx_vstelm_w(vec4, dst_org, 0, 0);
2411         __lsx_vstelm_h(vec2, dst_org, 4, 4);
2412         dst_org += stride;
2413         __lsx_vstelm_w(vec4, dst_org, 0, 1);
2414         __lsx_vstelm_h(vec2, dst_org, 4, 5);
2415         dst_org += stride;
2416         __lsx_vstelm_w(vec4, dst_org, 0, 2);
2417         __lsx_vstelm_h(vec2, dst_org, 4, 6);
2418         dst_org += stride;
2419         __lsx_vstelm_w(vec4, dst_org, 0, 3);
2420         __lsx_vstelm_h(vec2, dst_org, 4, 7);
2421         return 1;
2422     } else {
2423         dst -= 7 * 16;
2424 
2425         p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2426         p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2427         p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2428         p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2429         p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2430         p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2431         p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2432         p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2433         q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2434 
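             /* p6: wide flat2 filter, (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4 */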
2435         tmp0_l = p7_l_in << 3;
2436         tmp0_l -= p7_l_in;
2437         tmp0_l += p6_l_in;
2438         tmp0_l += q0_l_in;
2439         tmp1_l = p6_l_in + p5_l_in;
2440         tmp1_l += p4_l_in;
2441         tmp1_l += p3_l_in;
2442         tmp1_l += p2_l_in;
2443         tmp1_l += p1_l_in;
2444         tmp1_l += p0_l_in;
2445         tmp1_l += tmp0_l;
2446 
2447         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2448         out_l = __lsx_vpickev_b(out_l, out_l);
2449         p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2450         __lsx_vstelm_d(p6, dst, 0, 0);
2451         dst += 16;
2452 
2453         /* p5 */
2454         q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2455         tmp0_l = p5_l_in - p6_l_in;
2456         tmp0_l += q1_l_in;
2457         tmp0_l -= p7_l_in;
2458         tmp1_l += tmp0_l;
2459         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2460         out_l = __lsx_vpickev_b(out_l, out_l);
2461         p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2462         __lsx_vstelm_d(p5, dst, 0, 0);
2463         dst += 16;
2464 
2465         /* p4 */
2466         q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2467         tmp0_l = p4_l_in - p5_l_in;
2468         tmp0_l += q2_l_in;
2469         tmp0_l -= p7_l_in;
2470         tmp1_l += tmp0_l;
2471         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2472         out_l = __lsx_vpickev_b(out_l, out_l);
2473         p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2474         __lsx_vstelm_d(p4, dst, 0, 0);
2475         dst += 16;
2476 
2477         /* p3 */
2478         q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2479         tmp0_l = p3_l_in - p4_l_in;
2480         tmp0_l += q3_l_in;
2481         tmp0_l -= p7_l_in;
2482         tmp1_l += tmp0_l;
2483         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2484         out_l = __lsx_vpickev_b(out_l, out_l);
2485         p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2486         __lsx_vstelm_d(p3, dst, 0, 0);
2487         dst += 16;
2488 
2489         /* p2 */
2490         q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2491         filter8 = __lsx_vld(filter48, 0);
2492         tmp0_l = p2_l_in - p3_l_in;
2493         tmp0_l += q4_l_in;
2494         tmp0_l -= p7_l_in;
2495         tmp1_l += tmp0_l;
2496         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2497         out_l = __lsx_vpickev_b(out_l, out_l);
2498         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2499         __lsx_vstelm_d(filter8, dst, 0, 0);
2500         dst += 16;
2501 
2502         /* p1 */
2503         q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2504         filter8 = __lsx_vld(filter48, 16);
2505         tmp0_l = p1_l_in - p2_l_in;
2506         tmp0_l += q5_l_in;
2507         tmp0_l -= p7_l_in;
2508         tmp1_l += tmp0_l;
2509         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2510         out_l = __lsx_vpickev_b(out_l, out_l);
2511         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2512         __lsx_vstelm_d(filter8, dst, 0, 0);
2513         dst += 16;
2514 
2515         /* p0 */
2516         q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2517         filter8 = __lsx_vld(filter48, 32);
2518         tmp0_l = p0_l_in - p1_l_in;
2519         tmp0_l += q6_l_in;
2520         tmp0_l -= p7_l_in;
2521         tmp1_l += tmp0_l;
2522         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2523         out_l = __lsx_vpickev_b(out_l, out_l);
2524         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2525         __lsx_vstelm_d(filter8, dst, 0, 0);
2526         dst += 16;
2527 
2528         /* q0 */
2529         q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
2530         filter8 = __lsx_vld(filter48, 48);
2531         tmp0_l = q7_l_in - p0_l_in;
2532         tmp0_l += q0_l_in;
2533         tmp0_l -= p7_l_in;
2534         tmp1_l += tmp0_l;
2535         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2536         out_l = __lsx_vpickev_b(out_l, out_l);
2537         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2538         __lsx_vstelm_d(filter8, dst, 0, 0);
2539         dst += 16;
2540 
2541         /* q1 */
2542         filter8 = __lsx_vld(filter48, 64);
2543         tmp0_l = q7_l_in - q0_l_in;
2544         tmp0_l += q1_l_in;
2545         tmp0_l -= p6_l_in;
2546         tmp1_l += tmp0_l;
2547         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2548         out_l = __lsx_vpickev_b(out_l, out_l);
2549         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2550         __lsx_vstelm_d(filter8, dst, 0, 0);
2551         dst += 16;
2552 
2553         /* q2 */
2554         filter8 = __lsx_vld(filter48, 80);
2555         tmp0_l = q7_l_in - q1_l_in;
2556         tmp0_l += q2_l_in;
2557         tmp0_l -= p5_l_in;
2558         tmp1_l += tmp0_l;
2559         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2560         out_l = __lsx_vpickev_b(out_l, out_l);
2561         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2562         __lsx_vstelm_d(filter8, dst, 0, 0);
2563         dst += 16;
2564 
2565         /* q3 */
2566         tmp0_l = q7_l_in - q2_l_in;
2567         tmp0_l += q3_l_in;
2568         tmp0_l -= p4_l_in;
2569         tmp1_l += tmp0_l;
2570         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2571         out_l = __lsx_vpickev_b(out_l, out_l);
2572         q3 = __lsx_vbitsel_v(q3, out_l, flat2);
2573         __lsx_vstelm_d(q3, dst, 0, 0);
2574         dst += 16;
2575 
2576         /* q4 */
2577         tmp0_l = q7_l_in - q3_l_in;
2578         tmp0_l += q4_l_in;
2579         tmp0_l -= p3_l_in;
2580         tmp1_l += tmp0_l;
2581         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2582         out_l = __lsx_vpickev_b(out_l, out_l);
2583         q4 = __lsx_vbitsel_v(q4, out_l, flat2);
2584         __lsx_vstelm_d(q4, dst, 0, 0);
2585         dst += 16;
2586 
2587         /* q5 */
2588         tmp0_l = q7_l_in - q4_l_in;
2589         tmp0_l += q5_l_in;
2590         tmp0_l -= p2_l_in;
2591         tmp1_l += tmp0_l;
2592         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2593         out_l = __lsx_vpickev_b(out_l, out_l);
2594         q5 = __lsx_vbitsel_v(q5, out_l, flat2);
2595         __lsx_vstelm_d(q5, dst, 0, 0);
2596         dst += 16;
2597 
2598         /* q6 */
2599         tmp0_l = q7_l_in - q5_l_in;
2600         tmp0_l += q6_l_in;
2601         tmp0_l -= p1_l_in;
2602         tmp1_l += tmp0_l;
2603         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2604         out_l = __lsx_vpickev_b(out_l, out_l);
2605         q6 = __lsx_vbitsel_v(q6, out_l, flat2);
2606         __lsx_vstelm_d(q6, dst, 0, 0);
2607 
2608         return 0;
2609     }
2610 }
2611 
2612 void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
2613                                int32_t b_limit_ptr,
2614                                int32_t limit_ptr,
2615                                int32_t thresh_ptr)
2616 {
2617     uint8_t early_exit = 0;
2618     uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
2619     uint8_t *filter48 = &transposed_input[16 * 16];
2620 
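         /* filter the vertical edge by transposing to rows, filtering, then transposing back */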
2621     vp9_transpose_16x8_to_8x16(dst - 8, stride, transposed_input);
2622 
2623     early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2624                                          &filter48[0], dst, stride,
2625                                          b_limit_ptr, limit_ptr, thresh_ptr);
2626 
2627     if (0 == early_exit) {
2628         early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), dst, stride,
2629                                        &filter48[0]);
2630 
2631         if (0 == early_exit) {
2632             vp9_transpose_8x16_to_16x8(transposed_input, dst - 8, stride);
2633         }
2634     }
2635 }
2636 
2637 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
2638                                         uint8_t *dst_org, ptrdiff_t stride,
2639                                         int32_t b_limit_ptr,
2640                                         int32_t limit_ptr,
2641                                         int32_t thresh_ptr)
2642 {
2643     ptrdiff_t stride2 = stride << 1;
2644     ptrdiff_t stride3 = stride2 + stride;
2645     ptrdiff_t stride4 = stride2 << 1;
2646     __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2647     __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2648     __m128i flat, mask, hev, thresh, b_limit, limit;
2649     __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2650     __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
2651     __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2652     __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2653     __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
2654     __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
2655     __m128i vec0, vec1, vec2, vec3, vec4, vec5;
2656     __m128i zero = __lsx_vldi(0);
2657 
2658     /* load vector elements */
2659     DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16,
2660               p3, p2, p1, p0);
2661     DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2662 
2663     thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
2664     b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2665     limit   = __lsx_vreplgr2vr_b(limit_ptr);
2666 
2667     /* mask and hev */
2668     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2669                  hev, mask, flat);
2670     /* flat4 */
2671     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2672     /* filter4 */
2673     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2674                        q1_out);
2675 
2676     /* if flat is zero for all pixels, then no need to calculate other filter */
2677     if (__lsx_bz_v(flat)) {
2678         DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2679         vec2 = __lsx_vilvl_h(vec1, vec0);
2680         vec3 = __lsx_vilvh_h(vec1, vec0);
2681         DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2682         vec4 = __lsx_vilvl_h(vec1, vec0);
2683         vec5 = __lsx_vilvh_h(vec1, vec0);
2684 
2685         dst_org -= 2;
2686         __lsx_vstelm_w(vec2, dst_org, 0, 0);
2687         __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
2688         __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
2689         __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
2690         dst_org += stride4;
2691         __lsx_vstelm_w(vec3, dst_org, 0, 0);
2692         __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
2693         __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
2694         __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
2695         dst_org += stride4;
2696         __lsx_vstelm_w(vec4, dst_org, 0, 0);
2697         __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
2698         __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
2699         __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
2700         dst_org += stride4;
2701         __lsx_vstelm_w(vec5, dst_org, 0, 0);
2702         __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
2703         __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
2704         __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
2705 
2706         return 1;
2707     } else {
2708         DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2709                   p3_l, p2_l, p1_l, p0_l);
2710         DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2711                   q0_l, q1_l, q2_l, q3_l);
2712         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2713                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2714         DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2715                   p3_h, p2_h, p1_h, p0_h);
2716         DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2717                   q0_h, q1_h, q2_h, q3_h);
2718         VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2719                     p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2720 
2721         /* convert 16 bit output data into 8 bit */
2722         DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
2723                       p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h,
2724                       q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
2725                       q0_filt8_l);
2726         DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
2727                   q2_filt8_l, q1_filt8_l, q2_filt8_l);
2728 
2729         /* store pixel values */
2730         p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
2731         p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
2732         p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
2733         q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
2734         q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
2735         q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
2736 
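             /* save the filter8 outputs and the flat mask for the 16-wide pass */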
2737         __lsx_vst(p2_out, filter48, 0);
2738         __lsx_vst(p1_out, filter48, 16);
2739         __lsx_vst(p0_out, filter48, 32);
2740         __lsx_vst(q0_out, filter48, 48);
2741         __lsx_vst(q1_out, filter48, 64);
2742         __lsx_vst(q2_out, filter48, 80);
2743         __lsx_vst(flat, filter48, 96);
2744 
2745         return 0;
2746     }
2747 }
2748 
2749 static int32_t vp9_vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org,
2750                                   ptrdiff_t stride,
2751                                   uint8_t *filter48)
2752 {
2753     __m128i zero = __lsx_vldi(0);
2754     __m128i flat, flat2, filter8;
2755     __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2756     v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2757     v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2758     v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2759     v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2760     v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
2761     v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
2762     v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
2763     v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
2764     v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
2765     __m128i out_l, out_h;
2766     uint8_t *dst_tmp = dst - 128;
2767 
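         /* flat mask computed by the preceding filter8 pass */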
2768     flat = __lsx_vld(filter48, 96);
2769 
2770     DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2771               dst_tmp, 48, p7, p6, p5, p4);
2772     DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2773               dst_tmp, 112, p3, p2, p1, p0);
2774     DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2775     DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2776 
2777     VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2778 
2779     /* if flat2 is zero for all pixels, then no need to calculate other filter */
2780     if (__lsx_bz_v(flat2)) {
2781         __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2782 
2783         DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2784                   filter48, 48, p2, p1, p0, q0);
2785         DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2786 
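             /* transpose the 16 rows of filter8 results back and store 6 pixels (p2..q2) per row */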
2787         DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2788         vec3 = __lsx_vilvl_h(vec1, vec0);
2789         vec4 = __lsx_vilvh_h(vec1, vec0);
2790         DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2791         vec6 = __lsx_vilvl_h(vec1, vec0);
2792         vec7 = __lsx_vilvh_h(vec1, vec0);
2793         vec2 = __lsx_vilvl_b(q2, q1);
2794         vec5 = __lsx_vilvh_b(q2, q1);
2795 
2796         dst_org -= 3;
2797         __lsx_vstelm_w(vec3, dst_org, 0, 0);
2798         __lsx_vstelm_h(vec2, dst_org, 4, 0);
2799         dst_org += stride;
2800         __lsx_vstelm_w(vec3, dst_org, 0, 1);
2801         __lsx_vstelm_h(vec2, dst_org, 4, 1);
2802         dst_org += stride;
2803         __lsx_vstelm_w(vec3, dst_org, 0, 2);
2804         __lsx_vstelm_h(vec2, dst_org, 4, 2);
2805         dst_org += stride;
2806         __lsx_vstelm_w(vec3, dst_org, 0, 3);
2807         __lsx_vstelm_h(vec2, dst_org, 4, 3);
2808         dst_org += stride;
2809         __lsx_vstelm_w(vec4, dst_org, 0, 0);
2810         __lsx_vstelm_h(vec2, dst_org, 4, 4);
2811         dst_org += stride;
2812         __lsx_vstelm_w(vec4, dst_org, 0, 1);
2813         __lsx_vstelm_h(vec2, dst_org, 4, 5);
2814         dst_org += stride;
2815         __lsx_vstelm_w(vec4, dst_org, 0, 2);
2816         __lsx_vstelm_h(vec2, dst_org, 4, 6);
2817         dst_org += stride;
2818         __lsx_vstelm_w(vec4, dst_org, 0, 3);
2819         __lsx_vstelm_h(vec2, dst_org, 4, 7);
2820         dst_org += stride;
2821         __lsx_vstelm_w(vec6, dst_org, 0, 0);
2822         __lsx_vstelm_h(vec5, dst_org, 4, 0);
2823         dst_org += stride;
2824         __lsx_vstelm_w(vec6, dst_org, 0, 1);
2825         __lsx_vstelm_h(vec5, dst_org, 4, 1);
2826         dst_org += stride;
2827         __lsx_vstelm_w(vec6, dst_org, 0, 2);
2828         __lsx_vstelm_h(vec5, dst_org, 4, 2);
2829         dst_org += stride;
2830         __lsx_vstelm_w(vec6, dst_org, 0, 3);
2831         __lsx_vstelm_h(vec5, dst_org, 4, 3);
2832         dst_org += stride;
2833         __lsx_vstelm_w(vec7, dst_org, 0, 0);
2834         __lsx_vstelm_h(vec5, dst_org, 4, 4);
2835         dst_org += stride;
2836         __lsx_vstelm_w(vec7, dst_org, 0, 1);
2837         __lsx_vstelm_h(vec5, dst_org, 4, 5);
2838         dst_org += stride;
2839         __lsx_vstelm_w(vec7, dst_org, 0, 2);
2840         __lsx_vstelm_h(vec5, dst_org, 4, 6);
2841         dst_org += stride;
2842         __lsx_vstelm_w(vec7, dst_org, 0, 3);
2843         __lsx_vstelm_h(vec5, dst_org, 4, 7);
2844 
2845         return 1;
2846     } else {
2847         dst -= 7 * 16;
2848 
2849         p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2850         p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2851         p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2852         p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2853         p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2854         p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2855         p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2856         p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2857         q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2858 
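             /* p6: wide flat2 filter on low and high halves,
                (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4 */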
2859         tmp0_l = p7_l_in << 3;
2860         tmp0_l -= p7_l_in;
2861         tmp0_l += p6_l_in;
2862         tmp0_l += q0_l_in;
2863         tmp1_l = p6_l_in + p5_l_in;
2864         tmp1_l += p4_l_in;
2865         tmp1_l += p3_l_in;
2866         tmp1_l += p2_l_in;
2867         tmp1_l += p1_l_in;
2868         tmp1_l += p0_l_in;
2869         tmp1_l += tmp0_l;
2870         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2871 
2872         p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
2873         p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
2874         p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
2875         p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
2876         p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
2877         p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
2878         p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
2879         p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
2880         q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
2881 
2882         tmp0_h = p7_h_in << 3;
2883         tmp0_h -= p7_h_in;
2884         tmp0_h += p6_h_in;
2885         tmp0_h += q0_h_in;
2886         tmp1_h = p6_h_in + p5_h_in;
2887         tmp1_h += p4_h_in;
2888         tmp1_h += p3_h_in;
2889         tmp1_h += p2_h_in;
2890         tmp1_h += p1_h_in;
2891         tmp1_h += p0_h_in;
2892         tmp1_h += tmp0_h;
2893         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2894 
2895         out_l = __lsx_vpickev_b(out_h, out_l);
2896         p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2897         __lsx_vst(p6, dst, 0);
2898 
2899         /* p5 */
2900         q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2901         tmp0_l = p5_l_in - p6_l_in;
2902         tmp0_l += q1_l_in;
2903         tmp0_l -= p7_l_in;
2904         tmp1_l += tmp0_l;
2905         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2906         q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
2907         tmp0_h = p5_h_in - p6_h_in;
2908         tmp0_h += q1_h_in;
2909         tmp0_h -= p7_h_in;
2910         tmp1_h += tmp0_h;
2911         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2912         out_l = __lsx_vpickev_b(out_h, out_l);
2913         p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2914         __lsx_vst(p5, dst, 16);
2915 
2916         /* p4 */
2917         q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2918         tmp0_l = p4_l_in - p5_l_in;
2919         tmp0_l += q2_l_in;
2920         tmp0_l -= p7_l_in;
2921         tmp1_l += tmp0_l;
2922         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2923         q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
2924         tmp0_h = p4_h_in - p5_h_in;
2925         tmp0_h += q2_h_in;
2926         tmp0_h -= p7_h_in;
2927         tmp1_h += tmp0_h;
2928         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2929         out_l = __lsx_vpickev_b(out_h, out_l);
2930         p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2931         __lsx_vst(p4, dst, 16*2);
2932 
2933         /* p3 */
2934         q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2935         tmp0_l = p3_l_in - p4_l_in;
2936         tmp0_l += q3_l_in;
2937         tmp0_l -= p7_l_in;
2938         tmp1_l += tmp0_l;
2939         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2940         q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
2941         tmp0_h = p3_h_in - p4_h_in;
2942         tmp0_h += q3_h_in;
2943         tmp0_h -= p7_h_in;
2944         tmp1_h += tmp0_h;
2945         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2946         out_l = __lsx_vpickev_b(out_h, out_l);
2947         p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2948         __lsx_vst(p3, dst, 16*3);
2949 
2950         /* p2 */
2951         q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2952         filter8 = __lsx_vld(filter48, 0);
2953         tmp0_l = p2_l_in - p3_l_in;
2954         tmp0_l += q4_l_in;
2955         tmp0_l -= p7_l_in;
2956         tmp1_l += tmp0_l;
2957         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2958         q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
2959         tmp0_h = p2_h_in - p3_h_in;
2960         tmp0_h += q4_h_in;
2961         tmp0_h -= p7_h_in;
2962         tmp1_h += tmp0_h;
2963         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2964         out_l = __lsx_vpickev_b(out_h, out_l);
2965         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2966         __lsx_vst(filter8, dst, 16*4);
2967 
2968         /* p1 */
2969         q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2970         filter8 = __lsx_vld(filter48, 16);
2971         tmp0_l = p1_l_in - p2_l_in;
2972         tmp0_l += q5_l_in;
2973         tmp0_l -= p7_l_in;
2974         tmp1_l += tmp0_l;
2975         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2976         q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
2977         tmp0_h = p1_h_in - p2_h_in;
2978         tmp0_h += q5_h_in;
2979         tmp0_h -= p7_h_in;
2980         tmp1_h += tmp0_h;
2981         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2982         out_l = __lsx_vpickev_b(out_h, out_l);
2983         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2984         __lsx_vst(filter8, dst, 16*5);
2985 
2986         /* p0 */
2987         q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2988         filter8 = __lsx_vld(filter48, 32);
2989         tmp0_l = p0_l_in - p1_l_in;
2990         tmp0_l += q6_l_in;
2991         tmp0_l -= p7_l_in;
2992         tmp1_l += tmp0_l;
2993         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2994         q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
2995         tmp0_h = p0_h_in - p1_h_in;
2996         tmp0_h += q6_h_in;
2997         tmp0_h -= p7_h_in;
2998         tmp1_h += tmp0_h;
2999         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3000         out_l = __lsx_vpickev_b(out_h, out_l);
3001         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3002         __lsx_vst(filter8, dst, 16*6);
3003 
3004         /* q0 */
3005         q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
3006         filter8 = __lsx_vld(filter48, 48);
3007         tmp0_l = q7_l_in - p0_l_in;
3008         tmp0_l += q0_l_in;
3009         tmp0_l -= p7_l_in;
3010         tmp1_l += tmp0_l;
3011         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3012         q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
3013         tmp0_h = q7_h_in - p0_h_in;
3014         tmp0_h += q0_h_in;
3015         tmp0_h -= p7_h_in;
3016         tmp1_h += tmp0_h;
3017         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3018         out_l = __lsx_vpickev_b(out_h, out_l);
3019         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3020         __lsx_vst(filter8, dst, 16*7);
3021 
3022         /* q1 */
3023         filter8 = __lsx_vld(filter48, 64);
3024         tmp0_l = q7_l_in - q0_l_in;
3025         tmp0_l += q1_l_in;
3026         tmp0_l -= p6_l_in;
3027         tmp1_l += tmp0_l;
3028         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3029         tmp0_h = q7_h_in - q0_h_in;
3030         tmp0_h += q1_h_in;
3031         tmp0_h -= p6_h_in;
3032         tmp1_h += tmp0_h;
3033         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3034         out_l = __lsx_vpickev_b(out_h, out_l);
3035         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3036         __lsx_vst(filter8, dst, 16*8);
3037 
3038         /* q2 */
3039         filter8 = __lsx_vld(filter48, 80);
3040         tmp0_l = q7_l_in - q1_l_in;
3041         tmp0_l += q2_l_in;
3042         tmp0_l -= p5_l_in;
3043         tmp1_l += tmp0_l;
3044         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3045         tmp0_h = q7_h_in - q1_h_in;
3046         tmp0_h += q2_h_in;
3047         tmp0_h -= p5_h_in;
3048         tmp1_h += tmp0_h;
3049         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3050         out_l = __lsx_vpickev_b(out_h, out_l);
3051         filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3052         __lsx_vst(filter8, dst, 16*9);
3053 
3054         /* q3 */
3055         tmp0_l = q7_l_in - q2_l_in;
3056         tmp0_l += q3_l_in;
3057         tmp0_l -= p4_l_in;
3058         tmp1_l += tmp0_l;
3059         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3060         tmp0_h = q7_h_in - q2_h_in;
3061         tmp0_h += q3_h_in;
3062         tmp0_h -= p4_h_in;
3063         tmp1_h += tmp0_h;
3064         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3065         out_l = __lsx_vpickev_b(out_h, out_l);
3066         q3 = __lsx_vbitsel_v(q3, out_l, flat2);
3067         __lsx_vst(q3, dst, 16*10);
3068 
3069         /* q4 */
3070         tmp0_l = q7_l_in - q3_l_in;
3071         tmp0_l += q4_l_in;
3072         tmp0_l -= p3_l_in;
3073         tmp1_l += tmp0_l;
3074         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3075         tmp0_h = q7_h_in - q3_h_in;
3076         tmp0_h += q4_h_in;
3077         tmp0_h -= p3_h_in;
3078         tmp1_h += tmp0_h;
3079         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3080         out_l = __lsx_vpickev_b(out_h, out_l);
3081         q4 = __lsx_vbitsel_v(q4, out_l, flat2);
3082         __lsx_vst(q4, dst, 16*11);
3083 
3084         /* q5 */
3085         tmp0_l = q7_l_in - q4_l_in;
3086         tmp0_l += q5_l_in;
3087         tmp0_l -= p2_l_in;
3088         tmp1_l += tmp0_l;
3089         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3090         tmp0_h = q7_h_in - q4_h_in;
3091         tmp0_h += q5_h_in;
3092         tmp0_h -= p2_h_in;
3093         tmp1_h += tmp0_h;
3094         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3095         out_l = __lsx_vpickev_b(out_h, out_l);
3096         q5 = __lsx_vbitsel_v(q5, out_l, flat2);
3097         __lsx_vst(q5, dst, 16*12);
3098 
3099         /* q6 */
3100         tmp0_l = q7_l_in - q5_l_in;
3101         tmp0_l += q6_l_in;
3102         tmp0_l -= p1_l_in;
3103         tmp1_l += tmp0_l;
3104         out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3105         tmp0_h = q7_h_in - q5_h_in;
3106         tmp0_h += q6_h_in;
3107         tmp0_h -= p1_h_in;
3108         tmp1_h += tmp0_h;
3109         out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3110         out_l = __lsx_vpickev_b(out_h, out_l);
3111         q6 = __lsx_vbitsel_v(q6, out_l, flat2);
3112         __lsx_vst(q6, dst, 16*13);
3113 
3114         return 0;
3115     }
3116 }
3117 
3118 void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
3119                                 int32_t b_limit_ptr,
3120                                 int32_t limit_ptr,
3121                                 int32_t thresh_ptr)
3122 {
3123     uint8_t early_exit = 0;
3124     uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
3125     uint8_t *filter48 = &transposed_input[16 * 16];
3126 
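         /* same approach as the 8-row variant: transpose, filter, transpose back */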
3127     vp9_transpose_16x16((dst - 8), stride, &transposed_input[0], 16);
3128 
3129     early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
3130                                           &filter48[0], dst, stride,
3131                                           b_limit_ptr, limit_ptr, thresh_ptr);
3132 
3133     if (0 == early_exit) {
3134         early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), dst,
3135                                          stride, &filter48[0]);
3136 
3137         if (0 == early_exit) {
3138             vp9_transpose_16x16(transposed_input, 16, (dst - 8), stride);
3139         }
3140     }
3141 }
3142