1 /*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Jin Bo <jinbo@loongson.cn>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include "libavcodec/vp9dsp.h"
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "libavutil/common.h"
25 #include "vp9dsp_loongarch.h"
26
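/* Load eight rows of 16 bytes starting at _src into _in0.._in7;
 * _stride2/_stride3/_stride4 are expected to hold 2x, 3x and 4x _stride. */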
27 #define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, _in2, \
28 _in3, _in4, _in5, _in6, _in7) \
29 { \
30 _in0 = __lsx_vld(_src, 0); \
31 _in1 = __lsx_vldx(_src, _stride); \
32 _in2 = __lsx_vldx(_src, _stride2); \
33 _in3 = __lsx_vldx(_src, _stride3); \
34 _src += _stride4; \
35 _in4 = __lsx_vld(_src, 0); \
36 _in5 = __lsx_vldx(_src, _stride); \
37 _in6 = __lsx_vldx(_src, _stride2); \
38 _in7 = __lsx_vldx(_src, _stride3); \
39 }
40
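/* Store eight 16-byte rows _dst0.._dst7 to _dst, using the same
 * precomputed stride multiples as LSX_LD_8. */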
41 #define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, \
42 _dst, _stride, _stride2, _stride3, _stride4) \
43 { \
44 __lsx_vst(_dst0, _dst, 0); \
45 __lsx_vstx(_dst1, _dst, _stride); \
46 __lsx_vstx(_dst2, _dst, _stride2); \
47 __lsx_vstx(_dst3, _dst, _stride3); \
48 _dst += _stride4; \
49 __lsx_vst(_dst4, _dst, 0); \
50 __lsx_vstx(_dst5, _dst, _stride); \
51 __lsx_vstx(_dst6, _dst, _stride2); \
52 __lsx_vstx(_dst7, _dst, _stride3); \
53 }
54
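/* Narrow (filter4) loop filter on p1/p0/q0/q1. Pixels are biased by 0x80
 * to work in the signed domain, the filter value is built with saturating
 * adds, gated by hev_src and mask_src, and the adjusted pixels are
 * un-biased on output. */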
55 #define VP9_LPF_FILTER4_4W(p1_src, p0_src, q0_src, q1_src, mask_src, hev_src, \
56 p1_dst, p0_dst, q0_dst, q1_dst) \
57 { \
58 __m128i p1_tmp, p0_tmp, q0_tmp, q1_tmp, q0_sub_p0, filt, filt1, filt2; \
59 const __m128i cnst3b = __lsx_vldi(3); \
60 const __m128i cnst4b = __lsx_vldi(4); \
61 \
62 p1_tmp = __lsx_vxori_b(p1_src, 0x80); \
63 p0_tmp = __lsx_vxori_b(p0_src, 0x80); \
64 q0_tmp = __lsx_vxori_b(q0_src, 0x80); \
65 q1_tmp = __lsx_vxori_b(q1_src, 0x80); \
66 \
67 filt = __lsx_vssub_b(p1_tmp, q1_tmp); \
68 \
69 filt = filt & hev_src; \
70 \
71 q0_sub_p0 = __lsx_vssub_b(q0_tmp, p0_tmp); \
72 filt = __lsx_vsadd_b(filt, q0_sub_p0); \
73 filt = __lsx_vsadd_b(filt, q0_sub_p0); \
74 filt = __lsx_vsadd_b(filt, q0_sub_p0); \
75 filt = filt & mask_src; \
76 \
77 filt1 = __lsx_vsadd_b(filt, cnst4b); \
78 filt1 = __lsx_vsrai_b(filt1, 3); \
79 \
80 filt2 = __lsx_vsadd_b(filt, cnst3b); \
81 filt2 = __lsx_vsrai_b(filt2, 3); \
82 \
83 q0_tmp = __lsx_vssub_b(q0_tmp, filt1); \
84 q0_dst = __lsx_vxori_b(q0_tmp, 0x80); \
85 p0_tmp = __lsx_vsadd_b(p0_tmp, filt2); \
86 p0_dst = __lsx_vxori_b(p0_tmp, 0x80); \
87 \
88 filt = __lsx_vsrari_b(filt1, 1); \
89 hev_src = __lsx_vxori_b(hev_src, 0xff); \
90 filt = filt & hev_src; \
91 \
92 q1_tmp = __lsx_vssub_b(q1_tmp, filt); \
93 q1_dst = __lsx_vxori_b(q1_tmp, 0x80); \
94 p1_tmp = __lsx_vsadd_b(p1_tmp, filt); \
95 p1_dst = __lsx_vxori_b(p1_tmp, 0x80); \
96 }
97
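/* Compute the "flat" mask selecting the filter8 path: flat_dst enters
 * holding max(|p1-p0|, |q1-q0|) and is set where that and |p2-p0|,
 * |q2-q0|, |p3-p0|, |q3-q0| are all <= 1, then ANDed with the caller's
 * local "mask". */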
98 #define VP9_FLAT4(p3_src, p2_src, p0_src, q0_src, q2_src, q3_src, flat_dst) \
99 { \
100 __m128i f_tmp = __lsx_vldi(1); \
101 __m128i p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
102 \
103 p2_a_sub_p0 = __lsx_vabsd_bu(p2_src, p0_src); \
104 q2_a_sub_q0 = __lsx_vabsd_bu(q2_src, q0_src); \
105 p3_a_sub_p0 = __lsx_vabsd_bu(p3_src, p0_src); \
106 q3_a_sub_q0 = __lsx_vabsd_bu(q3_src, q0_src); \
107 \
108 p2_a_sub_p0 = __lsx_vmax_bu(p2_a_sub_p0, q2_a_sub_q0); \
109 flat_dst = __lsx_vmax_bu(p2_a_sub_p0, flat_dst); \
110 p3_a_sub_p0 = __lsx_vmax_bu(p3_a_sub_p0, q3_a_sub_q0); \
111 flat_dst = __lsx_vmax_bu(p3_a_sub_p0, flat_dst); \
112 \
113 flat_dst = __lsx_vslt_bu(f_tmp, flat_dst); \
114 flat_dst = __lsx_vxori_b(flat_dst, 0xff); \
115 flat_dst = flat_dst & mask; \
116 }
117
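/* Compute the "flat2" mask selecting the 15-tap path: set where
 * |p4..p7 - p0| and |q4..q7 - q0| are all <= 1, ANDed with flat_src. */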
118 #define VP9_FLAT5(p7_src, p6_src, p5_src, p4_src, p0_src, q0_src, q4_src, \
119 q5_src, q6_src, q7_src, flat_src, flat2_dst) \
120 { \
121 __m128i f_tmp = __lsx_vldi(1); \
122 __m128i p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
123 __m128i p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
124 \
125 p4_a_sub_p0 = __lsx_vabsd_bu(p4_src, p0_src); \
126 q4_a_sub_q0 = __lsx_vabsd_bu(q4_src, q0_src); \
127 p5_a_sub_p0 = __lsx_vabsd_bu(p5_src, p0_src); \
128 q5_a_sub_q0 = __lsx_vabsd_bu(q5_src, q0_src); \
129 p6_a_sub_p0 = __lsx_vabsd_bu(p6_src, p0_src); \
130 q6_a_sub_q0 = __lsx_vabsd_bu(q6_src, q0_src); \
131 p7_a_sub_p0 = __lsx_vabsd_bu(p7_src, p0_src); \
132 q7_a_sub_q0 = __lsx_vabsd_bu(q7_src, q0_src); \
133 \
134 p4_a_sub_p0 = __lsx_vmax_bu(p4_a_sub_p0, q4_a_sub_q0); \
135 flat2_dst = __lsx_vmax_bu(p5_a_sub_p0, q5_a_sub_q0); \
136 flat2_dst = __lsx_vmax_bu(p4_a_sub_p0, flat2_dst); \
137 p6_a_sub_p0 = __lsx_vmax_bu(p6_a_sub_p0, q6_a_sub_q0); \
138 flat2_dst = __lsx_vmax_bu(p6_a_sub_p0, flat2_dst); \
139 p7_a_sub_p0 = __lsx_vmax_bu(p7_a_sub_p0, q7_a_sub_q0); \
140 flat2_dst = __lsx_vmax_bu(p7_a_sub_p0, flat2_dst); \
141 \
142 flat2_dst = __lsx_vslt_bu(f_tmp, flat2_dst); \
143 flat2_dst = __lsx_vxori_b(flat2_dst, 0xff); \
144 flat2_dst = flat2_dst & flat_src; \
145 }
146
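/* filter8 smoothing on 16-bit lanes: each output is a rounded sum over an
 * 8-sample window, e.g. p2_filt8 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3.
 * Results are left in 16-bit form for later packing. */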
147 #define VP9_FILTER8(p3_src, p2_src, p1_src, p0_src, \
148 q0_src, q1_src, q2_src, q3_src, \
149 p2_filt8_dst, p1_filt8_dst, p0_filt8_dst, \
150 q0_filt8_dst, q1_filt8_dst, q2_filt8_dst) \
151 { \
152 __m128i tmp0, tmp1, tmp2; \
153 \
154 tmp2 = __lsx_vadd_h(p2_src, p1_src); \
155 tmp2 = __lsx_vadd_h(tmp2, p0_src); \
156 tmp0 = __lsx_vslli_h(p3_src, 1); \
157 \
158 tmp0 = __lsx_vadd_h(tmp0, tmp2); \
159 tmp0 = __lsx_vadd_h(tmp0, q0_src); \
160 tmp1 = __lsx_vadd_h(tmp0, p3_src); \
161 tmp1 = __lsx_vadd_h(tmp1, p2_src); \
162 p2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
163 \
164 tmp1 = __lsx_vadd_h(tmp0, p1_src); \
165 tmp1 = __lsx_vadd_h(tmp1, q1_src); \
166 p1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
167 \
168 tmp1 = __lsx_vadd_h(q2_src, q1_src); \
169 tmp1 = __lsx_vadd_h(tmp1, q0_src); \
170 tmp2 = __lsx_vadd_h(tmp2, tmp1); \
171 tmp0 = __lsx_vadd_h(tmp2, p0_src); \
172 tmp0 = __lsx_vadd_h(tmp0, p3_src); \
173 p0_filt8_dst = __lsx_vsrari_h(tmp0, 3); \
174 \
175 tmp0 = __lsx_vadd_h(q2_src, q3_src); \
176 tmp0 = __lsx_vadd_h(tmp0, p0_src); \
177 tmp0 = __lsx_vadd_h(tmp0, tmp1); \
178 tmp1 = __lsx_vadd_h(q3_src, q3_src); \
179 tmp1 = __lsx_vadd_h(tmp1, tmp0); \
180 q2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
181 \
182 tmp0 = __lsx_vadd_h(tmp2, q3_src); \
183 tmp1 = __lsx_vadd_h(tmp0, q0_src); \
184 q0_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
185 \
186 tmp1 = __lsx_vsub_h(tmp0, p2_src); \
187 tmp0 = __lsx_vadd_h(q1_src, q3_src); \
188 tmp1 = __lsx_vadd_h(tmp0, tmp1); \
189 q1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
190 }
191
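/* Build the loop-filter masks: hev_dst is set where
 * max(|p1-p0|, |q1-q0|) > thresh; mask_dst is set where
 * 2*|p0-q0| + |p1-q1|/2 <= b_limit and every neighbouring pixel
 * difference is <= limit; flat_dst returns max(|p1-p0|, |q1-q0|) for
 * reuse by VP9_FLAT4. */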
192 #define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, \
193 q2_src, q3_src, limit_src, b_limit_src, thresh_src, \
194 hev_dst, mask_dst, flat_dst) \
195 { \
196 __m128i p3_asub_p2_tmp, p2_asub_p1_tmp, p1_asub_p0_tmp, q1_asub_q0_tmp; \
197 __m128i p1_asub_q1_tmp, p0_asub_q0_tmp, q3_asub_q2_tmp, q2_asub_q1_tmp; \
198 \
199 /* absolute subtraction of pixel values */ \
200 p3_asub_p2_tmp = __lsx_vabsd_bu(p3_src, p2_src); \
201 p2_asub_p1_tmp = __lsx_vabsd_bu(p2_src, p1_src); \
202 p1_asub_p0_tmp = __lsx_vabsd_bu(p1_src, p0_src); \
203 q1_asub_q0_tmp = __lsx_vabsd_bu(q1_src, q0_src); \
204 q2_asub_q1_tmp = __lsx_vabsd_bu(q2_src, q1_src); \
205 q3_asub_q2_tmp = __lsx_vabsd_bu(q3_src, q2_src); \
206 p0_asub_q0_tmp = __lsx_vabsd_bu(p0_src, q0_src); \
207 p1_asub_q1_tmp = __lsx_vabsd_bu(p1_src, q1_src); \
208 \
209 /* calculation of hev */ \
210 flat_dst = __lsx_vmax_bu(p1_asub_p0_tmp, q1_asub_q0_tmp); \
211 hev_dst = __lsx_vslt_bu(thresh_src, flat_dst); \
212 \
213 /* calculation of mask */ \
214 p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p0_asub_q0_tmp); \
215 p1_asub_q1_tmp = __lsx_vsrli_b(p1_asub_q1_tmp, 1); \
216 p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p1_asub_q1_tmp); \
217 \
218 mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_tmp); \
219 mask_dst = __lsx_vmax_bu(flat_dst, mask_dst); \
220 p3_asub_p2_tmp = __lsx_vmax_bu(p3_asub_p2_tmp, p2_asub_p1_tmp); \
221 mask_dst = __lsx_vmax_bu(p3_asub_p2_tmp, mask_dst); \
222 q2_asub_q1_tmp = __lsx_vmax_bu(q2_asub_q1_tmp, q3_asub_q2_tmp); \
223 mask_dst = __lsx_vmax_bu(q2_asub_q1_tmp, mask_dst); \
224 \
225 mask_dst = __lsx_vslt_bu(limit_src, mask_dst); \
226 mask_dst = __lsx_vxori_b(mask_dst, 0xff); \
227 }
228
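/* Horizontal-edge ("v") loop filter, 8 pixels wide, filter4 only: load
 * the four rows above and below dst, build the masks and write the
 * filtered p1..q1 rows back as 8-byte stores. */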
229 void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
230 int32_t b_limit_ptr,
231 int32_t limit_ptr,
232 int32_t thresh_ptr)
233 {
234 ptrdiff_t stride2 = stride << 1;
235 ptrdiff_t stride3 = stride2 + stride;
236 ptrdiff_t stride4 = stride2 << 1;
237 __m128i mask, hev, flat, thresh, b_limit, limit;
238 __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
239
240 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
241 dst, -stride, p3, p2, p1, p0);
242 q0 = __lsx_vld(dst, 0);
243 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
244 q3 = __lsx_vldx(dst, stride3);
245
246 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
247 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
248 limit = __lsx_vreplgr2vr_b(limit_ptr);
249
250 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
251 hev, mask, flat);
252
253 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
254 q1_out);
255
256 __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
257 __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
258 __lsx_vstelm_d(q0_out, dst , 0, 0);
259 __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
260 }
261
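/* Same as the 4_8 version but covers two adjacent 8-pixel blocks at once;
 * the second block's thresh/b_limit/limit come from bits 8-15 of the
 * packed scalar arguments. */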
262 void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
263 int32_t b_limit_ptr,
264 int32_t limit_ptr,
265 int32_t thresh_ptr)
266 {
267 ptrdiff_t stride2 = stride << 1;
268 ptrdiff_t stride3 = stride2 + stride;
269 ptrdiff_t stride4 = stride2 << 1;
270 __m128i mask, hev, flat, thresh0, b_limit0;
271 __m128i limit0, thresh1, b_limit1, limit1;
272 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
273
274 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
275 dst, -stride, p3, p2, p1, p0);
276 q0 = __lsx_vld(dst, 0);
277 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
278 q3 = __lsx_vldx(dst, stride3);
279
280 thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
281 thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
282 thresh0 = __lsx_vilvl_d(thresh1, thresh0);
283
284 b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
285 b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
286 b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
287
288 limit0 = __lsx_vreplgr2vr_b(limit_ptr);
289 limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
290 limit0 = __lsx_vilvl_d(limit1, limit0);
291
292 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
293 hev, mask, flat);
294 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
295
296 __lsx_vst(p1, dst - stride2, 0);
297 __lsx_vst(p0, dst - stride, 0);
298 __lsx_vst(q0, dst , 0);
299 __lsx_vst(q1, dst + stride, 0);
300 }
301
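/* 8-pixel horizontal edge with the filter8 path: run filter4 everywhere,
 * then, where the flat mask is set, replace p2..q2 with the filter8
 * output computed in 16-bit precision. */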
302 void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
303 int32_t b_limit_ptr,
304 int32_t limit_ptr,
305 int32_t thresh_ptr)
306 {
307 ptrdiff_t stride2 = stride << 1;
308 ptrdiff_t stride3 = stride2 + stride;
309 ptrdiff_t stride4 = stride2 << 1;
310 __m128i mask, hev, flat, thresh, b_limit, limit;
311 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
312 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
313 __m128i p2_filter8, p1_filter8, p0_filter8;
314 __m128i q0_filter8, q1_filter8, q2_filter8;
315 __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
316 __m128i zero = __lsx_vldi(0);
317
318 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
319 dst, -stride, p3, p2, p1, p0);
320 q0 = __lsx_vld(dst, 0);
321 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
322 q3 = __lsx_vldx(dst, stride3);
323
324 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
325 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
326 limit = __lsx_vreplgr2vr_b(limit_ptr);
327
328 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
329 hev, mask, flat);
330 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
331 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
332 q1_out);
333
334 flat = __lsx_vilvl_d(zero, flat);
335
336 /* if flat is zero for all pixels, then no need to calculate other filter */
337 if (__lsx_bz_v(flat)) {
338 __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
339 __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
340 __lsx_vstelm_d(q0_out, dst , 0, 0);
341 __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
342 } else {
343 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
344 p3_l, p2_l, p1_l, p0_l);
345 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
346 q0_l, q1_l, q2_l, q3_l);
347 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
348 p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
349
350 /* convert 16 bit output data into 8 bit */
351 DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
352 zero, p0_filter8, zero, q0_filter8, p2_filter8,
353 p1_filter8, p0_filter8, q0_filter8);
354 DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
355 q1_filter8, q2_filter8);
356
357 /* store pixel values */
358 p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
359 p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
360 p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
361 q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
362 q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
363 q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
364
365 __lsx_vstelm_d(p2_out, dst - stride3, 0, 0);
366 __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
367 __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
368 __lsx_vstelm_d(q0_out, dst, 0, 0);
369 __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
370 __lsx_vstelm_d(q2_out, dst + stride2, 0, 0);
371 }
372 }
373
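/* 16-pixel horizontal edge, filter8 on both 8-pixel halves: the low and
 * high halves are widened and filtered separately, with per-half
 * thresholds packed into bits 0-7 / 8-15 of the scalar arguments. */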
374 void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
375 int32_t b_limit_ptr,
376 int32_t limit_ptr,
377 int32_t thresh_ptr)
378 {
379 ptrdiff_t stride2 = stride << 1;
380 ptrdiff_t stride3 = stride2 + stride;
381 ptrdiff_t stride4 = stride2 << 1;
382 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
383 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
384 __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
385 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
386 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
387 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
388 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
389 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
390 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
391 __m128i zero = __lsx_vldi(0);
392
393 /* load vector elements */
394 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
395 dst, -stride, p3, p2, p1, p0);
396 q0 = __lsx_vld(dst, 0);
397 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
398 q3 = __lsx_vldx(dst, stride3);
399
400 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
401 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
402 thresh = __lsx_vilvl_d(tmp, thresh);
403
404 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
405 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
406 b_limit = __lsx_vilvl_d(tmp, b_limit);
407
408 limit = __lsx_vreplgr2vr_b(limit_ptr);
409 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
410 limit = __lsx_vilvl_d(tmp, limit);
411
412 /* mask and hev */
413 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
414 hev, mask, flat);
415 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
416 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
417 q1_out);
418
419 /* if flat is zero for all pixels, then no need to calculate other filter */
420 if (__lsx_bz_v(flat)) {
421 __lsx_vst(p1_out, dst - stride2, 0);
422 __lsx_vst(p0_out, dst - stride, 0);
423 __lsx_vst(q0_out, dst, 0);
424 __lsx_vst(q1_out, dst + stride, 0);
425 } else {
426 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
427 p3_l, p2_l, p1_l, p0_l);
428 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
429 q0_l, q1_l, q2_l, q3_l);
430 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
431 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
432
433 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
434 p3_h, p2_h, p1_h, p0_h);
435 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
436 q0_h, q1_h, q2_h, q3_h);
437 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
438 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
439
440 /* convert 16 bit output data into 8 bit */
441 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
442 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
443 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
444 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
445 q2_filt8_l, q1_filt8_l, q2_filt8_l);
446
447 /* store pixel values */
448 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
449 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
450 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
451 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
452 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
453 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
454
455
456 __lsx_vstx(p2_out, dst, -stride3);
457 __lsx_vstx(p1_out, dst, -stride2);
458 __lsx_vstx(p0_out, dst, -stride);
459 __lsx_vst(q0_out, dst, 0);
460 __lsx_vstx(q1_out, dst, stride);
461 __lsx_vstx(q2_out, dst, stride2);
462 }
463 }
464
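/* Mixed 8/4 edge: filter8 is applied only to the first (low) 8 pixels and
 * the second half gets filter4; the flat mask is confined to the low half
 * before branching. */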
465 void ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
466 int32_t b_limit_ptr,
467 int32_t limit_ptr,
468 int32_t thresh_ptr)
469 {
470 ptrdiff_t stride2 = stride << 1;
471 ptrdiff_t stride3 = stride2 + stride;
472 ptrdiff_t stride4 = stride2 << 1;
473 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
474 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
475 __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
476 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
477 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
478 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
479 __m128i zero = __lsx_vldi(0);
480
481 /* load vector elements */
482 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
483 dst, -stride, p3, p2, p1, p0);
484 q0 = __lsx_vld(dst, 0);
485 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
486 q3 = __lsx_vldx(dst, stride3);
487
488 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
489 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
490 thresh = __lsx_vilvl_d(tmp, thresh);
491
492 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
493 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
494 b_limit = __lsx_vilvl_d(tmp, b_limit);
495
496 limit = __lsx_vreplgr2vr_b(limit_ptr);
497 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
498 limit = __lsx_vilvl_d(tmp, limit);
499
500 /* mask and hev */
501 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
502 hev, mask, flat);
503 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
504 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
505 q1_out);
506
507 flat = __lsx_vilvl_d(zero, flat);
508
509 /* if flat is zero for all pixels, then no need to calculate other filter */
510 if (__lsx_bz_v(flat)) {
511 __lsx_vstx(p1_out, dst, -stride2);
512 __lsx_vstx(p0_out, dst, -stride);
513 __lsx_vst(q0_out, dst, 0);
514 __lsx_vstx(q1_out, dst, stride);
515 } else {
516 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
517 p3_l, p2_l, p1_l, p0_l);
518 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
519 q0_l, q1_l, q2_l, q3_l);
520 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
521 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
522
523 /* convert 16 bit output data into 8 bit */
524 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
525 p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
526 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
527 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
528 q2_filt8_l, q1_filt8_l, q2_filt8_l);
529
530 /* store pixel values */
531 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
532 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
533 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
534 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
535 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
536 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
537
538 __lsx_vstx(p2_out, dst, -stride3);
539 __lsx_vstx(p1_out, dst, -stride2);
540 __lsx_vstx(p0_out, dst, -stride);
541 __lsx_vst(q0_out, dst, 0);
542 __lsx_vstx(q1_out, dst, stride);
543 __lsx_vstx(q2_out, dst, stride2);
544 }
545 }
546
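/* Mirror of the 84 case: filter4 on the first 8 pixels, filter8 on the
 * second (high) 8 pixels, so only the high halves are widened. */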
547 void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
548 int32_t b_limit_ptr,
549 int32_t limit_ptr,
550 int32_t thresh_ptr)
551 {
552 ptrdiff_t stride2 = stride << 1;
553 ptrdiff_t stride3 = stride2 + stride;
554 ptrdiff_t stride4 = stride2 << 1;
555 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
556 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
557 __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
558 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
559 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
560 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
561 __m128i zero = { 0 };
562
563 /* load vector elements */
564 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
565 dst, -stride, p3, p2, p1, p0);
566 q0 = __lsx_vld(dst, 0);
567 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
568 q3 = __lsx_vldx(dst, stride3);
569
570 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
571 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
572 thresh = __lsx_vilvl_d(tmp, thresh);
573
574 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
575 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
576 b_limit = __lsx_vilvl_d(tmp, b_limit);
577
578 limit = __lsx_vreplgr2vr_b(limit_ptr);
579 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
580 limit = __lsx_vilvl_d(tmp, limit);
581
582 /* mask and hev */
583 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
584 hev, mask, flat);
585 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
586 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
587 q1_out);
588
589 flat = __lsx_vilvh_d(flat, zero);
590
591 /* if flat is zero for all pixels, then no need to calculate other filter */
592 if (__lsx_bz_v(flat)) {
593 __lsx_vstx(p1_out, dst, -stride2);
594 __lsx_vstx(p0_out, dst, -stride);
595 __lsx_vst(q0_out, dst, 0);
596 __lsx_vstx(q1_out, dst, stride);
597 } else {
598 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
599 p3_h, p2_h, p1_h, p0_h);
600 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
601 q0_h, q1_h, q2_h, q3_h);
602 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
603 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
604
605 /* convert 16 bit output data into 8 bit */
606 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
607 p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
608 p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
609 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
610 q2_filt8_h, q1_filt8_h, q2_filt8_h);
611
612 /* store pixel values */
613 p2_out = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
614 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
615 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
616 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
617 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
618 q2_out = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
619
620 __lsx_vstx(p2_out, dst, -stride3);
621 __lsx_vstx(p1_out, dst, -stride2);
622 __lsx_vstx(p0_out, dst, -stride);
623 __lsx_vst(q0_out, dst, 0);
624 __lsx_vstx(q1_out, dst, stride);
625 __lsx_vstx(q2_out, dst, stride2);
626 }
627 }
628
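/* First pass of the 16-pixel wide, 16-sample deep filter: run filter4 and
 * filter8. Returns 1 (early exit) after storing the filter4 result when
 * flat is zero everywhere; otherwise saves p2..q2 and the flat mask in
 * the filter48 scratch buffer (16 bytes per entry) and returns 0. */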
629 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *dst, ptrdiff_t stride,
630 uint8_t *filter48,
631 int32_t b_limit_ptr,
632 int32_t limit_ptr,
633 int32_t thresh_ptr)
634 {
635 ptrdiff_t stride2 = stride << 1;
636 ptrdiff_t stride3 = stride2 + stride;
637 ptrdiff_t stride4 = stride2 << 1;
638 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
639 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
640 __m128i flat, mask, hev, thresh, b_limit, limit;
641 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
642 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
643 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
644 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
645 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
646 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
647 __m128i zero = __lsx_vldi(0);
648
649 /* load vector elements */
650 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
651 dst, -stride, p3, p2, p1, p0);
652 q0 = __lsx_vld(dst, 0);
653 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
654 q3 = __lsx_vldx(dst, stride3);
655
656 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
657 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
658 limit = __lsx_vreplgr2vr_b(limit_ptr);
659
660 /* mask and hev */
661 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
662 hev, mask, flat);
663 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
664 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
665 q1_out);
666
667 /* if flat is zero for all pixels, then no need to calculate other filter */
668 if (__lsx_bz_v(flat)) {
669 __lsx_vstx(p1_out, dst, -stride2);
670 __lsx_vstx(p0_out, dst, -stride);
671 __lsx_vst(q0_out, dst, 0);
672 __lsx_vstx(q1_out, dst, stride);
673 return 1;
674 } else {
675 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
676 p3_l, p2_l, p1_l, p0_l);
677 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
678 q0_l, q1_l, q2_l, q3_l);
679 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
680 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
681
682 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
683 p3_h, p2_h, p1_h, p0_h);
684 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
685 q0_h, q1_h, q2_h, q3_h);
686 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
687 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
688
689 /* convert 16 bit output data into 8 bit */
690 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
691 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
692 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
693 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
694 q2_filt8_l, q1_filt8_l, q2_filt8_l);
695
696 /* store pixel values */
697 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
698 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
699 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
700 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
701 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
702 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
703
704 __lsx_vst(p2_out, filter48, 0);
705 __lsx_vst(p1_out, filter48, 16);
706 __lsx_vst(p0_out, filter48, 32);
707 __lsx_vst(q0_out, filter48, 48);
708 __lsx_vst(q1_out, filter48, 64);
709 __lsx_vst(q2_out, filter48, 80);
710 __lsx_vst(flat, filter48, 96);
711
712 return 0;
713 }
714 }
715
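/* Second pass: compute flat2 from p7..q7. Where flat2 is zero the filter8
 * results saved in filter48 are stored as-is; elsewhere the 15-tap filter
 * is applied as a running sum updated row by row, rounded with
 * (sum + 8) >> 4, producing p6..q6. */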
716 static void vp9_hz_lpf_t16_16w(uint8_t *dst, ptrdiff_t stride,
717 uint8_t *filter48)
718 {
719 ptrdiff_t stride2 = stride << 1;
720 ptrdiff_t stride3 = stride2 + stride;
721 ptrdiff_t stride4 = stride2 << 1;
722 uint8_t *dst_tmp = dst - stride4;
723 uint8_t *dst_tmp1 = dst + stride4;
724 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
725 __m128i flat, flat2, filter8;
726 __m128i zero = __lsx_vldi(0);
727 __m128i out_h, out_l;
728 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
729 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
730 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
731 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
732 v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
733 v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
734 v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
735 v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
736 v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
737
738 flat = __lsx_vld(filter48, 96);
739
740 DUP4_ARG2(__lsx_vldx, dst_tmp, -stride4, dst_tmp, -stride3, dst_tmp,
741 -stride2, dst_tmp, -stride, p7, p6, p5, p4);
742 p3 = __lsx_vld(dst_tmp, 0);
743 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
744 p0 = __lsx_vldx(dst_tmp, stride3);
745
746 q0 = __lsx_vld(dst, 0);
747 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
748 q3 = __lsx_vldx(dst, stride3);
749
750 q4 = __lsx_vld(dst_tmp1, 0);
751 DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
752 q7 = __lsx_vldx(dst_tmp1, stride3);
753 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
754
755 /* if flat2 is zero for all pixels, then no need to calculate other filter */
756 if (__lsx_bz_v(flat2)) {
757 DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48,
758 48, p2, p1, p0, q0);
759 DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
760
761 __lsx_vstx(p2, dst, -stride3);
762 __lsx_vstx(p1, dst, -stride2);
763 __lsx_vstx(p0, dst, -stride);
764 __lsx_vst(q0, dst, 0);
765 __lsx_vstx(q1, dst, stride);
766 __lsx_vstx(q2, dst, stride2);
767 } else {
768 dst = dst_tmp - stride3;
769
770 p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
771 p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
772 p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
773 p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
774 p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
775 p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
776 p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
777 p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
778
779 q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
780
781 tmp0_l = p7_l_in << 3;
782 tmp0_l -= p7_l_in;
783 tmp0_l += p6_l_in;
784 tmp0_l += q0_l_in;
785 tmp1_l = p6_l_in + p5_l_in;
786 tmp1_l += p4_l_in;
787 tmp1_l += p3_l_in;
788 tmp1_l += p2_l_in;
789 tmp1_l += p1_l_in;
790 tmp1_l += p0_l_in;
791 tmp1_l += tmp0_l;
792
793 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
794
795 p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
796 p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
797 p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
798 p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
799
800 p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
801 p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
802 p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
803 p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
804 q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
805
806 tmp0_h = p7_h_in << 3;
807 tmp0_h -= p7_h_in;
808 tmp0_h += p6_h_in;
809 tmp0_h += q0_h_in;
810 tmp1_h = p6_h_in + p5_h_in;
811 tmp1_h += p4_h_in;
812 tmp1_h += p3_h_in;
813 tmp1_h += p2_h_in;
814 tmp1_h += p1_h_in;
815 tmp1_h += p0_h_in;
816 tmp1_h += tmp0_h;
817
818 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
819
820 out_l = __lsx_vpickev_b(out_h, out_l);
821 p6 = __lsx_vbitsel_v(p6, out_l, flat2);
822 __lsx_vst(p6, dst, 0);
823 dst += stride;
824
825 /* p5 */
826 q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
827 tmp0_l = p5_l_in - p6_l_in;
828 tmp0_l += q1_l_in;
829 tmp0_l -= p7_l_in;
830 tmp1_l += tmp0_l;
831 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
832
833 q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
834 tmp0_h = p5_h_in - p6_h_in;
835 tmp0_h += q1_h_in;
836 tmp0_h -= p7_h_in;
837 tmp1_h += tmp0_h;
838 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
839
840 out_l = __lsx_vpickev_b(out_h, out_l);
841 p5 = __lsx_vbitsel_v(p5, out_l, flat2);
842 __lsx_vst(p5, dst, 0);
843 dst += stride;
844
845 /* p4 */
846 q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
847 tmp0_l = p4_l_in - p5_l_in;
848 tmp0_l += q2_l_in;
849 tmp0_l -= p7_l_in;
850 tmp1_l += tmp0_l;
851 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
852
853 q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
854 tmp0_h = p4_h_in - p5_h_in;
855 tmp0_h += q2_h_in;
856 tmp0_h -= p7_h_in;
857 tmp1_h += tmp0_h;
858 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
859
860 out_l = __lsx_vpickev_b(out_h, out_l);
861 p4 = __lsx_vbitsel_v(p4, out_l, flat2);
862 __lsx_vst(p4, dst, 0);
863 dst += stride;
864
865 /* p3 */
866 q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
867 tmp0_l = p3_l_in - p4_l_in;
868 tmp0_l += q3_l_in;
869 tmp0_l -= p7_l_in;
870 tmp1_l += tmp0_l;
871 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
872
873 q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
874 tmp0_h = p3_h_in - p4_h_in;
875 tmp0_h += q3_h_in;
876 tmp0_h -= p7_h_in;
877 tmp1_h += tmp0_h;
878 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
879
880 out_l = __lsx_vpickev_b(out_h, out_l);
881 p3 = __lsx_vbitsel_v(p3, out_l, flat2);
882 __lsx_vst(p3, dst, 0);
883 dst += stride;
884
885 /* p2 */
886 q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
887 filter8 = __lsx_vld(filter48, 0);
888 tmp0_l = p2_l_in - p3_l_in;
889 tmp0_l += q4_l_in;
890 tmp0_l -= p7_l_in;
891 tmp1_l += tmp0_l;
892 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
893
894 q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
895 tmp0_h = p2_h_in - p3_h_in;
896 tmp0_h += q4_h_in;
897 tmp0_h -= p7_h_in;
898 tmp1_h += tmp0_h;
899 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
900
901 out_l = __lsx_vpickev_b(out_h, out_l);
902 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
903 __lsx_vst(filter8, dst, 0);
904 dst += stride;
905
906 /* p1 */
907 q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
908 filter8 = __lsx_vld(filter48, 16);
909 tmp0_l = p1_l_in - p2_l_in;
910 tmp0_l += q5_l_in;
911 tmp0_l -= p7_l_in;
912 tmp1_l += tmp0_l;
913 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
914
915 q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
916 tmp0_h = p1_h_in - p2_h_in;
917 tmp0_h += q5_h_in;
918 tmp0_h -= p7_h_in;
919 tmp1_h += tmp0_h;
920 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
921
922 out_l = __lsx_vpickev_b(out_h, out_l);
923 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
924 __lsx_vst(filter8, dst, 0);
925 dst += stride;
926
927 /* p0 */
928 q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
929 filter8 = __lsx_vld(filter48, 32);
930 tmp0_l = p0_l_in - p1_l_in;
931 tmp0_l += q6_l_in;
932 tmp0_l -= p7_l_in;
933 tmp1_l += tmp0_l;
934 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
935
936 q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
937 tmp0_h = p0_h_in - p1_h_in;
938 tmp0_h += q6_h_in;
939 tmp0_h -= p7_h_in;
940 tmp1_h += tmp0_h;
941 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
942
943 out_l = __lsx_vpickev_b(out_h, out_l);
944 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
945 __lsx_vst(filter8, dst, 0);
946 dst += stride;
947
948 /* q0 */
949 q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
950 filter8 = __lsx_vld(filter48, 48);
951 tmp0_l = q7_l_in - p0_l_in;
952 tmp0_l += q0_l_in;
953 tmp0_l -= p7_l_in;
954 tmp1_l += tmp0_l;
955 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
956
957 q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
958 tmp0_h = q7_h_in - p0_h_in;
959 tmp0_h += q0_h_in;
960 tmp0_h -= p7_h_in;
961 tmp1_h += tmp0_h;
962 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
963
964 out_l = __lsx_vpickev_b(out_h, out_l);
965 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
966 __lsx_vst(filter8, dst, 0);
967 dst += stride;
968
969 /* q1 */
970 filter8 = __lsx_vld(filter48, 64);
971 tmp0_l = q7_l_in - q0_l_in;
972 tmp0_l += q1_l_in;
973 tmp0_l -= p6_l_in;
974 tmp1_l += tmp0_l;
975 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
976
977 tmp0_h = q7_h_in - q0_h_in;
978 tmp0_h += q1_h_in;
979 tmp0_h -= p6_h_in;
980 tmp1_h += tmp0_h;
981 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
982
983 out_l = __lsx_vpickev_b(out_h, out_l);
984 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
985 __lsx_vst(filter8, dst, 0);
986 dst += stride;
987
988 /* q2 */
989 filter8 = __lsx_vld(filter48, 80);
990 tmp0_l = q7_l_in - q1_l_in;
991 tmp0_l += q2_l_in;
992 tmp0_l -= p5_l_in;
993 tmp1_l += tmp0_l;
994 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
995
996 tmp0_h = q7_h_in - q1_h_in;
997 tmp0_h += q2_h_in;
998 tmp0_h -= p5_h_in;
999 tmp1_h += tmp0_h;
1000 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1001
1002 out_l = __lsx_vpickev_b(out_h, out_l);
1003 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
1004 __lsx_vst(filter8, dst, 0);
1005 dst += stride;
1006
1007 /* q3 */
1008 tmp0_l = q7_l_in - q2_l_in;
1009 tmp0_l += q3_l_in;
1010 tmp0_l -= p4_l_in;
1011 tmp1_l += tmp0_l;
1012 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1013
1014 tmp0_h = q7_h_in - q2_h_in;
1015 tmp0_h += q3_h_in;
1016 tmp0_h -= p4_h_in;
1017 tmp1_h += tmp0_h;
1018 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1019
1020 out_l = __lsx_vpickev_b(out_h, out_l);
1021 q3 = __lsx_vbitsel_v(q3, out_l, flat2);
1022 __lsx_vst(q3, dst, 0);
1023 dst += stride;
1024
1025 /* q4 */
1026 tmp0_l = q7_l_in - q3_l_in;
1027 tmp0_l += q4_l_in;
1028 tmp0_l -= p3_l_in;
1029 tmp1_l += tmp0_l;
1030 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1031
1032 tmp0_h = q7_h_in - q3_h_in;
1033 tmp0_h += q4_h_in;
1034 tmp0_h -= p3_h_in;
1035 tmp1_h += tmp0_h;
1036 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1037
1038 out_l = __lsx_vpickev_b(out_h, out_l);
1039 q4 = __lsx_vbitsel_v(q4, out_l, flat2);
1040 __lsx_vst(q4, dst, 0);
1041 dst += stride;
1042
1043 /* q5 */
1044 tmp0_l = q7_l_in - q4_l_in;
1045 tmp0_l += q5_l_in;
1046 tmp0_l -= p2_l_in;
1047 tmp1_l += tmp0_l;
1048 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1049
1050 tmp0_h = q7_h_in - q4_h_in;
1051 tmp0_h += q5_h_in;
1052 tmp0_h -= p2_h_in;
1053 tmp1_h += tmp0_h;
1054 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1055
1056 out_l = __lsx_vpickev_b(out_h, out_l);
1057 q5 = __lsx_vbitsel_v(q5, out_l, flat2);
1058 __lsx_vst(q5, dst, 0);
1059 dst += stride;
1060
1061 /* q6 */
1062 tmp0_l = q7_l_in - q5_l_in;
1063 tmp0_l += q6_l_in;
1064 tmp0_l -= p1_l_in;
1065 tmp1_l += tmp0_l;
1066 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1067
1068 tmp0_h = q7_h_in - q5_h_in;
1069 tmp0_h += q6_h_in;
1070 tmp0_h -= p1_h_in;
1071 tmp1_h += tmp0_h;
1072 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1073
1074 out_l = __lsx_vpickev_b(out_h, out_l);
1075 q6 = __lsx_vbitsel_v(q6, out_l, flat2);
1076 __lsx_vst(q6, dst, 0);
1077 }
1078 }
1079
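/* Full 16-pixel wide, 16-sample deep horizontal-edge filter: t4/t8 pass
 * first, then the t16 pass unless the first pass took its early exit. */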
1080 void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
1081 int32_t b_limit_ptr,
1082 int32_t limit_ptr,
1083 int32_t thresh_ptr)
1084 {
1085 uint8_t filter48[16 * 8] __attribute__ ((aligned(16)));
1086 uint8_t early_exit = 0;
1087
1088 early_exit = vp9_hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0],
1089 b_limit_ptr, limit_ptr, thresh_ptr);
1090
1091 if (0 == early_exit) {
1092 vp9_hz_lpf_t16_16w(dst, stride, filter48);
1093 }
1094 }
1095
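/* 8-pixel wide version of the full filter: filter4/filter8 first, then,
 * where flat2 is set, the 15-tap filter evaluated two rows at a time on
 * the low 8 pixels. */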
1096 void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
1097 int32_t b_limit_ptr,
1098 int32_t limit_ptr,
1099 int32_t thresh_ptr)
1100 {
1101 ptrdiff_t stride2 = stride << 1;
1102 ptrdiff_t stride3 = stride2 + stride;
1103 ptrdiff_t stride4 = stride2 << 1;
1104 uint8_t *dst_tmp = dst - stride4;
1105 uint8_t *dst_tmp1 = dst + stride4;
1106 __m128i zero = __lsx_vldi(0);
1107 __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
1108 __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
1109 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1110 __m128i p0_filter16, p1_filter16;
1111 __m128i p2_filter8, p1_filter8, p0_filter8;
1112 __m128i q0_filter8, q1_filter8, q2_filter8;
1113 __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
1114 __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
1115 __m128i tmp0, tmp1, tmp2;
1116
1117 /* load vector elements */
1118 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
1119 dst, -stride, p3, p2, p1, p0);
1120 q0 = __lsx_vld(dst, 0);
1121 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
1122 q3 = __lsx_vldx(dst, stride3);
1123
1124 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1125 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1126 limit = __lsx_vreplgr2vr_b(limit_ptr);
1127
1128 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1129 hev, mask, flat);
1130 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1131 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1132 q1_out);
1133
1134 flat = __lsx_vilvl_d(zero, flat);
1135
1136 /* if flat is zero for all pixels, then no need to calculate other filter */
1137 if (__lsx_bz_v(flat)) {
1138 __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
1139 __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
1140 __lsx_vstelm_d(q0_out, dst , 0, 0);
1141 __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
1142 } else {
1143 /* convert 8 bit input data into 16 bit */
1144 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1145 p3_l, p2_l, p1_l, p0_l);
1146 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1147 q0_l, q1_l, q2_l, q3_l);
1148 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
1149 p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1150 q1_filter8, q2_filter8);
1151
1152 /* convert 16 bit output data into 8 bit */
1153 DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
1154 zero, p0_filter8, zero, q0_filter8, p2_filter8,
1155 p1_filter8, p0_filter8, q0_filter8);
1156 DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
1157 q1_filter8, q2_filter8);
1158
1159 /* store pixel values */
1160 p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
1161 p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
1162 p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
1163 q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
1164 q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
1165 q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
1166
1167 /* load 16 vector elements */
1168 DUP4_ARG2(__lsx_vld, dst_tmp - stride4, 0, dst_tmp - stride3, 0,
1169 dst_tmp - stride2, 0, dst_tmp - stride, 0, p7, p6, p5, p4);
1170 DUP4_ARG2(__lsx_vld, dst_tmp1, 0, dst_tmp1 + stride, 0,
1171 dst_tmp1 + stride2, 0, dst_tmp1 + stride3, 0, q4, q5, q6, q7);
1172
1173 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1174
1175 /* if flat2 is zero for all pixels, then no need to calculate other filter */
1176 if (__lsx_bz_v(flat2)) {
1177 dst -= stride3;
1178 __lsx_vstelm_d(p2_out, dst, 0, 0);
1179 dst += stride;
1180 __lsx_vstelm_d(p1_out, dst, 0, 0);
1181 dst += stride;
1182 __lsx_vstelm_d(p0_out, dst, 0, 0);
1183 dst += stride;
1184 __lsx_vstelm_d(q0_out, dst, 0, 0);
1185 dst += stride;
1186 __lsx_vstelm_d(q1_out, dst, 0, 0);
1187 dst += stride;
1188 __lsx_vstelm_d(q2_out, dst, 0, 0);
1189 } else {
1190 /* operate on the low (right) 8 pixels, widened to 16 bits */
1191 DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4,
1192 p7_l, p6_l, p5_l, p4_l);
1193 DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7,
1194 q4_l, q5_l, q6_l, q7_l);
1195
1196 tmp0 = __lsx_vslli_h(p7_l, 3);
1197 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1198 tmp0 = __lsx_vadd_h(tmp0, p6_l);
1199 tmp0 = __lsx_vadd_h(tmp0, q0_l);
1200
1201 dst = dst_tmp - stride3;
1202
1203 /* calculation of p6 and p5 */
1204 tmp1 = __lsx_vadd_h(p6_l, p5_l);
1205 tmp1 = __lsx_vadd_h(tmp1, p4_l);
1206 tmp1 = __lsx_vadd_h(tmp1, p3_l);
1207 tmp1 = __lsx_vadd_h(tmp1, p2_l);
1208 tmp1 = __lsx_vadd_h(tmp1, p1_l);
1209 tmp1 = __lsx_vadd_h(tmp1, p0_l);
1210 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1211
1212 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1213 tmp0 = __lsx_vsub_h(p5_l, p6_l);
1214 tmp0 = __lsx_vadd_h(tmp0, q1_l);
1215 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1216 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1217
1218 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1219 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1220 p1_filter16, p0_filter16, p1_filter16);
1221 p0_filter16 = __lsx_vbitsel_v(p6, p0_filter16, flat2);
1222 p1_filter16 = __lsx_vbitsel_v(p5, p1_filter16, flat2);
1223 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1224 dst += stride;
1225 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1226 dst += stride;
1227
1228 /* calculation of p4 and p3 */
1229 tmp0 = __lsx_vsub_h(p4_l, p5_l);
1230 tmp0 = __lsx_vadd_h(tmp0, q2_l);
1231 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1232 tmp2 = __lsx_vsub_h(p3_l, p4_l);
1233 tmp2 = __lsx_vadd_h(tmp2, q3_l);
1234 tmp2 = __lsx_vsub_h(tmp2, p7_l);
1235 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1236 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1237 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1238 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1239 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1240 p1_filter16, p0_filter16, p1_filter16);
1241 p0_filter16 = __lsx_vbitsel_v(p4, p0_filter16, flat2);
1242 p1_filter16 = __lsx_vbitsel_v(p3, p1_filter16, flat2);
1243 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1244 dst += stride;
1245 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1246 dst += stride;
1247
1248 /* calculation of p2 and p1 */
1249 tmp0 = __lsx_vsub_h(p2_l, p3_l);
1250 tmp0 = __lsx_vadd_h(tmp0, q4_l);
1251 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1252 tmp2 = __lsx_vsub_h(p1_l, p2_l);
1253 tmp2 = __lsx_vadd_h(tmp2, q5_l);
1254 tmp2 = __lsx_vsub_h(tmp2, p7_l);
1255 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1256 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1257 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1258 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1259 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1260 p1_filter16, p0_filter16, p1_filter16);
1261 p0_filter16 = __lsx_vbitsel_v(p2_out, p0_filter16, flat2);
1262 p1_filter16 = __lsx_vbitsel_v(p1_out, p1_filter16, flat2);
1263 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1264 dst += stride;
1265 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1266 dst += stride;
1267
1268 /* calculation of p0 and q0 */
1269 tmp0 = __lsx_vsub_h(p0_l, p1_l);
1270 tmp0 = __lsx_vadd_h(tmp0, q6_l);
1271 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1272 tmp2 = __lsx_vsub_h(q7_l, p0_l);
1273 tmp2 = __lsx_vadd_h(tmp2, q0_l);
1274 tmp2 = __lsx_vsub_h(tmp2, p7_l);
1275 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1276 p0_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1277 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1278 p1_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1279 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1280 p1_filter16, p0_filter16, p1_filter16);
1281 p0_filter16 = __lsx_vbitsel_v(p0_out, p0_filter16, flat2);
1282 p1_filter16 = __lsx_vbitsel_v(q0_out, p1_filter16, flat2);
1283 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1284 dst += stride;
1285 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1286 dst += stride;
1287
1288 /* calculation of q1 and q2 */
1289 tmp0 = __lsx_vsub_h(q7_l, q0_l);
1290 tmp0 = __lsx_vadd_h(tmp0, q1_l);
1291 tmp0 = __lsx_vsub_h(tmp0, p6_l);
1292 tmp2 = __lsx_vsub_h(q7_l, q1_l);
1293 tmp2 = __lsx_vadd_h(tmp2, q2_l);
1294 tmp2 = __lsx_vsub_h(tmp2, p5_l);
1295 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1296 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1297 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1298 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1299 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1300 p1_filter16, p0_filter16, p1_filter16);
1301 p0_filter16 = __lsx_vbitsel_v(q1_out, p0_filter16, flat2);
1302 p1_filter16 = __lsx_vbitsel_v(q2_out, p1_filter16, flat2);
1303 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1304 dst += stride;
1305 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1306 dst += stride;
1307
1308 /* calculation of q3 and q4 */
1309 tmp0 = __lsx_vsub_h(q7_l, q2_l);
1310 tmp0 = __lsx_vadd_h(tmp0, q3_l);
1311 tmp0 = __lsx_vsub_h(tmp0, p4_l);
1312 tmp2 = __lsx_vsub_h(q7_l, q3_l);
1313 tmp2 = __lsx_vadd_h(tmp2, q4_l);
1314 tmp2 = __lsx_vsub_h(tmp2, p3_l);
1315 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1316 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1317 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1318 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1319 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1320 p1_filter16, p0_filter16, p1_filter16);
1321 p0_filter16 = __lsx_vbitsel_v(q3, p0_filter16, flat2);
1322 p1_filter16 = __lsx_vbitsel_v(q4, p1_filter16, flat2);
1323 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1324 dst += stride;
1325 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1326 dst += stride;
1327
1328 /* calculation of q5 and q6 */
1329 tmp0 = __lsx_vsub_h(q7_l, q4_l);
1330 tmp0 = __lsx_vadd_h(tmp0, q5_l);
1331 tmp0 = __lsx_vsub_h(tmp0, p2_l);
1332 tmp2 = __lsx_vsub_h(q7_l, q5_l);
1333 tmp2 = __lsx_vadd_h(tmp2, q6_l);
1334 tmp2 = __lsx_vsub_h(tmp2, p1_l);
1335 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1336 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1337 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1338 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1339 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1340 p1_filter16, p0_filter16, p1_filter16);
1341 p0_filter16 = __lsx_vbitsel_v(q5, p0_filter16, flat2);
1342 p1_filter16 = __lsx_vbitsel_v(q6, p1_filter16, flat2);
1343 __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1344 dst += stride;
1345 __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1346 }
1347 }
1348 }
1349
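/* Vertical-edge ("h") filter4 for 8 rows: load eight rows starting four
 * pixels left of the edge, transpose 8x8 so the columns p3..q3 become
 * vectors, filter, and store p1..q1 back as 4-byte writes at dst - 2. */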
1350 void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
1351 int32_t b_limit_ptr,
1352 int32_t limit_ptr,
1353 int32_t thresh_ptr)
1354 {
1355 ptrdiff_t stride2 = stride << 1;
1356 ptrdiff_t stride3 = stride2 + stride;
1357 ptrdiff_t stride4 = stride2 << 1;
1358 uint8_t *dst_tmp1 = dst - 4;
1359 uint8_t *dst_tmp2 = dst_tmp1 + stride4;
1360 __m128i mask, hev, flat, limit, thresh, b_limit;
1361 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1362 __m128i vec0, vec1, vec2, vec3;
1363
1364 p3 = __lsx_vld(dst_tmp1, 0);
1365 DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, p2, p1);
1366 p0 = __lsx_vldx(dst_tmp1, stride3);
1367 q0 = __lsx_vld(dst_tmp2, 0);
1368 DUP2_ARG2(__lsx_vldx, dst_tmp2, stride, dst_tmp2, stride2, q1, q2);
1369 q3 = __lsx_vldx(dst_tmp2, stride3);
1370
1371 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1372 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1373 limit = __lsx_vreplgr2vr_b(limit_ptr);
1374
1375 LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1376 p3, p2, p1, p0, q0, q1, q2, q3);
1377 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1378 hev, mask, flat);
1379 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1380 DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
1381 vec2 = __lsx_vilvl_h(vec1, vec0);
1382 vec3 = __lsx_vilvh_h(vec1, vec0);
1383
1384 dst -= 2;
1385 __lsx_vstelm_w(vec2, dst, 0, 0);
1386 __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1387 __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1388 __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1389 dst += stride4;
1390 __lsx_vstelm_w(vec3, dst, 0, 0);
1391 __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1392 __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1393 __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1394 }
1395
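/* As above for 16 rows with two packed threshold sets: a 16x8 transpose
 * feeds filter4 and the results are written back 4 bytes per row. */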
1396 void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
1397 int32_t b_limit_ptr,
1398 int32_t limit_ptr,
1399 int32_t thresh_ptr)
1400 {
1401 ptrdiff_t stride2 = stride << 1;
1402 ptrdiff_t stride3 = stride2 + stride;
1403 ptrdiff_t stride4 = stride2 << 1;
1404 uint8_t *dst_tmp = dst - 4;
1405 __m128i mask, hev, flat;
1406 __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1407 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1408 __m128i row0, row1, row2, row3, row4, row5, row6, row7;
1409 __m128i row8, row9, row10, row11, row12, row13, row14, row15;
1410 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1411
1412 row0 = __lsx_vld(dst_tmp, 0);
1413 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row1, row2);
1414 row3 = __lsx_vldx(dst_tmp, stride3);
1415 dst_tmp += stride4;
1416 row4 = __lsx_vld(dst_tmp, 0);
1417 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1418 row7 = __lsx_vldx(dst_tmp, stride3);
1419 dst_tmp += stride4;
1420 row8 = __lsx_vld(dst_tmp, 0);
1421 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row9, row10);
1422 row11 = __lsx_vldx(dst_tmp, stride3);
1423 dst_tmp += stride4;
1424 row12 = __lsx_vld(dst_tmp, 0);
1425 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1426 row15 = __lsx_vldx(dst_tmp, stride3);
1427
1428 LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
1429 row8, row9, row10, row11, row12, row13, row14, row15,
1430 p3, p2, p1, p0, q0, q1, q2, q3);
1431
1432 thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
1433 thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1434 thresh0 = __lsx_vilvl_d(thresh1, thresh0);
1435
1436 b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
1437 b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1438 b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
1439
1440 limit0 = __lsx_vreplgr2vr_b(limit_ptr);
1441 limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1442 limit0 = __lsx_vilvl_d(limit1, limit0);
1443
1444 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1445 hev, mask, flat);
1446 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1447 DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
1448 tmp2 = __lsx_vilvl_h(tmp1, tmp0);
1449 tmp3 = __lsx_vilvh_h(tmp1, tmp0);
1450 DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
1451 tmp4 = __lsx_vilvl_h(tmp1, tmp0);
1452 tmp5 = __lsx_vilvh_h(tmp1, tmp0);
1453
1454 dst -= 2;
1455 __lsx_vstelm_w(tmp2, dst, 0, 0);
1456 __lsx_vstelm_w(tmp2, dst + stride, 0, 1);
1457 __lsx_vstelm_w(tmp2, dst + stride2, 0, 2);
1458 __lsx_vstelm_w(tmp2, dst + stride3, 0, 3);
1459 dst += stride4;
1460 __lsx_vstelm_w(tmp3, dst, 0, 0);
1461 __lsx_vstelm_w(tmp3, dst + stride, 0, 1);
1462 __lsx_vstelm_w(tmp3, dst + stride2, 0, 2);
1463 __lsx_vstelm_w(tmp3, dst + stride3, 0, 3);
1464 dst += stride4;
1465 __lsx_vstelm_w(tmp4, dst, 0, 0);
1466 __lsx_vstelm_w(tmp4, dst + stride, 0, 1);
1467 __lsx_vstelm_w(tmp4, dst + stride2, 0, 2);
1468 __lsx_vstelm_w(tmp4, dst + stride3, 0, 3);
1469 dst += stride4;
1470 __lsx_vstelm_w(tmp5, dst, 0, 0);
1471 __lsx_vstelm_w(tmp5, dst + stride, 0, 1);
1472 __lsx_vstelm_w(tmp5, dst + stride2, 0, 2);
1473 __lsx_vstelm_w(tmp5, dst + stride3, 0, 3);
1474 }
1475
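/* Vertical-edge filter8 for 8 rows: transpose, filter, then write back
 * either 4 bytes per row (flat == 0) or 6 bytes per row (p2..q2). */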
1476 void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
1477 int32_t b_limit_ptr,
1478 int32_t limit_ptr,
1479 int32_t thresh_ptr)
1480 {
1481 ptrdiff_t stride2 = stride << 1;
1482 ptrdiff_t stride3 = stride2 + stride;
1483 ptrdiff_t stride4 = stride2 << 1;
1484 uint8_t *dst_tmp = dst - 4;
1485 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1486 __m128i p1_out, p0_out, q0_out, q1_out;
1487 __m128i flat, mask, hev, thresh, b_limit, limit;
1488 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1489 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1490 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1491 __m128i vec0, vec1, vec2, vec3, vec4;
1492 __m128i zero = __lsx_vldi(0);
1493
1494 /* load vector elements */
1495 p3 = __lsx_vld(dst_tmp, 0);
1496 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
1497 p0 = __lsx_vldx(dst_tmp, stride3);
1498 dst_tmp += stride4;
1499 q0 = __lsx_vld(dst_tmp, 0);
1500 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
1501 q3 = __lsx_vldx(dst_tmp, stride3);
1502
1503 LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1504 p3, p2, p1, p0, q0, q1, q2, q3);
1505
1506 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1507 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1508 limit = __lsx_vreplgr2vr_b(limit_ptr);
1509
1510 /* mask and hev */
1511 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1512 hev, mask, flat);
1513 /* flat4 */
1514 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1515 /* filter4 */
1516 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1517 q1_out);
1518
1519 flat = __lsx_vilvl_d(zero, flat);
1520
1521 /* if flat is zero for all pixels, then no need to calculate other filter */
1522 if (__lsx_bz_v(flat)) {
1523 /* Store 4 pixels p1 - q1 */
1524 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1525 vec2 = __lsx_vilvl_h(vec1, vec0);
1526 vec3 = __lsx_vilvh_h(vec1, vec0);
1527
1528 dst -= 2;
1529 __lsx_vstelm_w(vec2, dst, 0, 0);
1530 __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1531 __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1532 __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1533 dst += stride4;
1534 __lsx_vstelm_w(vec3, dst, 0, 0);
1535 __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1536 __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1537 __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1538 } else {
1539 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1540 p3_l, p2_l, p1_l, p0_l);
1541 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1542 q0_l, q1_l, q2_l, q3_l);
1543 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1544 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1545 /* convert 16 bit output data into 8 bit */
1546 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
1547 p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l,
1548 q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
1549 q0_filt8_l);
1550 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
1551 q2_filt8_l, q1_filt8_l, q2_filt8_l);
1552
1553 /* store pixel values */
1554 p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1555 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1556 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1557 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1558 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1559 q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1560
1561 /* Store 6 pixels p2 - q2 */
1562 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1563 vec2 = __lsx_vilvl_h(vec1, vec0);
1564 vec3 = __lsx_vilvh_h(vec1, vec0);
1565 vec4 = __lsx_vilvl_b(q2, q1);
1566
1567 dst -= 3;
1568 __lsx_vstelm_w(vec2, dst, 0, 0);
1569 __lsx_vstelm_h(vec4, dst, 4, 0);
1570 dst += stride;
1571 __lsx_vstelm_w(vec2, dst, 0, 1);
1572 __lsx_vstelm_h(vec4, dst, 4, 1);
1573 dst += stride;
1574 __lsx_vstelm_w(vec2, dst, 0, 2);
1575 __lsx_vstelm_h(vec4, dst, 4, 2);
1576 dst += stride;
1577 __lsx_vstelm_w(vec2, dst, 0, 3);
1578 __lsx_vstelm_h(vec4, dst, 4, 3);
1579 dst += stride;
1580 __lsx_vstelm_w(vec3, dst, 0, 0);
1581 __lsx_vstelm_h(vec4, dst, 4, 4);
1582 dst += stride;
1583 __lsx_vstelm_w(vec3, dst, 0, 1);
1584 __lsx_vstelm_h(vec4, dst, 4, 5);
1585 dst += stride;
1586 __lsx_vstelm_w(vec3, dst, 0, 2);
1587 __lsx_vstelm_h(vec4, dst, 4, 6);
1588 dst += stride;
1589 __lsx_vstelm_w(vec3, dst, 0, 3);
1590 __lsx_vstelm_h(vec4, dst, 4, 7);
1591 }
1592 }
1593
1594 void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
1595 int32_t b_limit_ptr,
1596 int32_t limit_ptr,
1597 int32_t thresh_ptr)
1598 {
1599 ptrdiff_t stride2 = stride << 1;
1600 ptrdiff_t stride3 = stride2 + stride;
1601 ptrdiff_t stride4 = stride2 << 1;
1602 uint8_t *dst_tmp = dst - 4;
1603 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1604 __m128i p1_out, p0_out, q0_out, q1_out;
1605 __m128i flat, mask, hev, thresh, b_limit, limit;
1606 __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1607 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1608 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1609 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1610 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1611 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1612 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1613 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1614 __m128i zero = __lsx_vldi(0);
1615
1616 p0 = __lsx_vld(dst_tmp, 0);
1617 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1618 p3 = __lsx_vldx(dst_tmp, stride3);
1619 dst_tmp += stride4;
1620 row4 = __lsx_vld(dst_tmp, 0);
1621 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1622 row7 = __lsx_vldx(dst_tmp, stride3);
1623 dst_tmp += stride4;
1624 q3 = __lsx_vld(dst_tmp, 0);
1625 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1626 q0 = __lsx_vldx(dst_tmp, stride3);
1627 dst_tmp += stride4;
1628 row12 = __lsx_vld(dst_tmp, 0);
1629 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1630 row15 = __lsx_vldx(dst_tmp, stride3);
1631
1632 /* transpose 16x8 matrix into 8x16 */
1633 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1634 q3, q2, q1, q0, row12, row13, row14, row15,
1635 p3, p2, p1, p0, q0, q1, q2, q3);
1636
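/* thresh_ptr, b_limit_ptr and limit_ptr each pack two 8-bit values: the
 * low byte applies to rows 0-7 and the next byte to rows 8-15, so each
 * control vector carries one value per 64-bit half */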
1637 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1638 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1639 thresh = __lsx_vilvl_d(vec0, thresh);
1640
1641 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1642 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1643 b_limit = __lsx_vilvl_d(vec0, b_limit);
1644
1645 limit = __lsx_vreplgr2vr_b(limit_ptr);
1646 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1647 limit = __lsx_vilvl_d(vec0, limit);
1648
1649 /* mask and hev */
1650 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1651 hev, mask, flat);
1652 /* flat4 */
1653 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1654 /* filter4 */
1655 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1656 q1_out);
1657
1658 /* if flat is zero for all pixels, then no need to calculate other filter */
1659 if (__lsx_bz_v(flat)) {
1660 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1661 vec2 = __lsx_vilvl_h(vec1, vec0);
1662 vec3 = __lsx_vilvh_h(vec1, vec0);
1663 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1664 vec4 = __lsx_vilvl_h(vec1, vec0);
1665 vec5 = __lsx_vilvh_h(vec1, vec0);
1666
1667 dst -= 2;
1668 __lsx_vstelm_w(vec2, dst, 0, 0);
1669 __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1670 __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1671 __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1672 dst += stride4;
1673 __lsx_vstelm_w(vec3, dst, 0, 0);
1674 __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1675 __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1676 __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1677 dst += stride4;
1678 __lsx_vstelm_w(vec4, dst, 0, 0);
1679 __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1680 __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1681 __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1682 dst += stride4;
1683 __lsx_vstelm_w(vec5, dst, 0, 0);
1684 __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1685 __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1686 __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1687 } else {
1688 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1689 p3_l, p2_l, p1_l, p0_l);
1690 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1691 q0_l, q1_l, q2_l, q3_l);
1692 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1693 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1694
1695 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
1696 p3_h, p2_h, p1_h, p0_h);
1697 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
1698 q0_h, q1_h, q2_h, q3_h);
1699
1700 /* filter8 */
1701 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
1702 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
1703
1704 /* convert 16 bit output data into 8 bit */
1705 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
1706 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
1707 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1708 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
1709 q2_filt8_l, q1_filt8_l, q2_filt8_l);
1710
1711 /* store pixel values */
1712 p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1713 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1714 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1715 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1716 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1717 q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1718
1719 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1720 vec3 = __lsx_vilvl_h(vec1, vec0);
1721 vec4 = __lsx_vilvh_h(vec1, vec0);
1722 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1723 vec6 = __lsx_vilvl_h(vec1, vec0);
1724 vec7 = __lsx_vilvh_h(vec1, vec0);
1725 vec2 = __lsx_vilvl_b(q2, q1);
1726 vec5 = __lsx_vilvh_b(q2, q1);
1727
1728 dst -= 3;
1729 __lsx_vstelm_w(vec3, dst, 0, 0);
1730 __lsx_vstelm_h(vec2, dst, 4, 0);
1731 dst += stride;
1732 __lsx_vstelm_w(vec3, dst, 0, 1);
1733 __lsx_vstelm_h(vec2, dst, 4, 1);
1734 dst += stride;
1735 __lsx_vstelm_w(vec3, dst, 0, 2);
1736 __lsx_vstelm_h(vec2, dst, 4, 2);
1737 dst += stride;
1738 __lsx_vstelm_w(vec3, dst, 0, 3);
1739 __lsx_vstelm_h(vec2, dst, 4, 3);
1740 dst += stride;
1741 __lsx_vstelm_w(vec4, dst, 0, 0);
1742 __lsx_vstelm_h(vec2, dst, 4, 4);
1743 dst += stride;
1744 __lsx_vstelm_w(vec4, dst, 0, 1);
1745 __lsx_vstelm_h(vec2, dst, 4, 5);
1746 dst += stride;
1747 __lsx_vstelm_w(vec4, dst, 0, 2);
1748 __lsx_vstelm_h(vec2, dst, 4, 6);
1749 dst += stride;
1750 __lsx_vstelm_w(vec4, dst, 0, 3);
1751 __lsx_vstelm_h(vec2, dst, 4, 7);
1752 dst += stride;
1753 __lsx_vstelm_w(vec6, dst, 0, 0);
1754 __lsx_vstelm_h(vec5, dst, 4, 0);
1755 dst += stride;
1756 __lsx_vstelm_w(vec6, dst, 0, 1);
1757 __lsx_vstelm_h(vec5, dst, 4, 1);
1758 dst += stride;
1759 __lsx_vstelm_w(vec6, dst, 0, 2);
1760 __lsx_vstelm_h(vec5, dst, 4, 2);
1761 dst += stride;
1762 __lsx_vstelm_w(vec6, dst, 0, 3);
1763 __lsx_vstelm_h(vec5, dst, 4, 3);
1764 dst += stride;
1765 __lsx_vstelm_w(vec7, dst, 0, 0);
1766 __lsx_vstelm_h(vec5, dst, 4, 4);
1767 dst += stride;
1768 __lsx_vstelm_w(vec7, dst, 0, 1);
1769 __lsx_vstelm_h(vec5, dst, 4, 5);
1770 dst += stride;
1771 __lsx_vstelm_w(vec7, dst, 0, 2);
1772 __lsx_vstelm_h(vec5, dst, 4, 6);
1773 dst += stride;
1774 __lsx_vstelm_w(vec7, dst, 0, 3);
1775 __lsx_vstelm_h(vec5, dst, 4, 7);
1776 }
1777 }
1778
1779 void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
1780 int32_t b_limit_ptr,
1781 int32_t limit_ptr,
1782 int32_t thresh_ptr)
1783 {
1784 ptrdiff_t stride2 = stride << 1;
1785 ptrdiff_t stride3 = stride2 + stride;
1786 ptrdiff_t stride4 = stride2 << 1;
1787 uint8_t *dst_tmp = dst - 4;
1788 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1789 __m128i p1_out, p0_out, q0_out, q1_out;
1790 __m128i flat, mask, hev, thresh, b_limit, limit;
1791 __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1792 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1793 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1794 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1795 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1796 __m128i zero = __lsx_vldi(0);
1797
1798 p0 = __lsx_vld(dst_tmp, 0);
1799 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1800 p3 = __lsx_vldx(dst_tmp, stride3);
1801 dst_tmp += stride4;
1802 row4 = __lsx_vld(dst_tmp, 0);
1803 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1804 row7 = __lsx_vldx(dst_tmp, stride3);
1805 dst_tmp += stride4;
1806 q3 = __lsx_vld(dst_tmp, 0);
1807 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1808 q0 = __lsx_vldx(dst_tmp, stride3);
1809 dst_tmp += stride4;
1810 row12 = __lsx_vld(dst_tmp, 0);
1811 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1812 row15 = __lsx_vldx(dst_tmp, stride3);
1813
1814 /* transpose 16x8 matrix into 8x16 */
1815 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1816 q3, q2, q1, q0, row12, row13, row14, row15,
1817 p3, p2, p1, p0, q0, q1, q2, q3);
1818
1819 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1820 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1821 thresh = __lsx_vilvl_d(vec0, thresh);
1822
1823 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1824 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1825 b_limit = __lsx_vilvl_d(vec0, b_limit);
1826
1827 limit = __lsx_vreplgr2vr_b(limit_ptr);
1828 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1829 limit = __lsx_vilvl_d(vec0, limit);
1830
1831 /* mask and hev */
1832 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1833 hev, mask, flat);
1834 /* flat4 */
1835 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1836 /* filter4 */
1837 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1838 q1_out);
1839
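/* 8+4 case: only the first block (rows 0-7, low half of the vectors) may
 * take the filter8 path, so clear the high half of the flat mask */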
1840 flat = __lsx_vilvl_d(zero, flat);
1841
1842 /* if flat is zero for all pixels, then no need to calculate other filter */
1843 if (__lsx_bz_v(flat)) {
1844 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1845 vec2 = __lsx_vilvl_h(vec1, vec0);
1846 vec3 = __lsx_vilvh_h(vec1, vec0);
1847 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1848 vec4 = __lsx_vilvl_h(vec1, vec0);
1849 vec5 = __lsx_vilvh_h(vec1, vec0);
1850
1851 dst -= 2;
1852 __lsx_vstelm_w(vec2, dst, 0, 0);
1853 __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1854 __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1855 __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1856 dst += stride4;
1857 __lsx_vstelm_w(vec3, dst, 0, 0);
1858 __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1859 __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1860 __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1861 dst += stride4;
1862 __lsx_vstelm_w(vec4, dst, 0, 0);
1863 __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1864 __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1865 __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1866 dst += stride4;
1867 __lsx_vstelm_w(vec5, dst, 0, 0);
1868 __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1869 __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1870 __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1871 } else {
1872 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1873 p3_l, p2_l, p1_l, p0_l);
1874 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1875 q0_l, q1_l, q2_l, q3_l);
1876 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1877 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1878
1879 /* convert 16 bit output data into 8 bit */
1880 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1881 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
1882 p1_filt8_l, p0_filt8_l, q0_filt8_l);
1883 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1884 q1_filt8_l, q2_filt8_l);
1885
1886 /* store pixel values */
1887 p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1888 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1889 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1890 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1891 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1892 q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1893
1894 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1895 vec3 = __lsx_vilvl_h(vec1, vec0);
1896 vec4 = __lsx_vilvh_h(vec1, vec0);
1897 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1898 vec6 = __lsx_vilvl_h(vec1, vec0);
1899 vec7 = __lsx_vilvh_h(vec1, vec0);
1900 vec2 = __lsx_vilvl_b(q2, q1);
1901 vec5 = __lsx_vilvh_b(q2, q1);
1902
1903 dst -= 3;
1904 __lsx_vstelm_w(vec3, dst, 0, 0);
1905 __lsx_vstelm_h(vec2, dst, 4, 0);
1906 dst += stride;
1907 __lsx_vstelm_w(vec3, dst, 0, 1);
1908 __lsx_vstelm_h(vec2, dst, 4, 1);
1909 dst += stride;
1910 __lsx_vstelm_w(vec3, dst, 0, 2);
1911 __lsx_vstelm_h(vec2, dst, 4, 2);
1912 dst += stride;
1913 __lsx_vstelm_w(vec3, dst, 0, 3);
1914 __lsx_vstelm_h(vec2, dst, 4, 3);
1915 dst += stride;
1916 __lsx_vstelm_w(vec4, dst, 0, 0);
1917 __lsx_vstelm_h(vec2, dst, 4, 4);
1918 dst += stride;
1919 __lsx_vstelm_w(vec4, dst, 0, 1);
1920 __lsx_vstelm_h(vec2, dst, 4, 5);
1921 dst += stride;
1922 __lsx_vstelm_w(vec4, dst, 0, 2);
1923 __lsx_vstelm_h(vec2, dst, 4, 6);
1924 dst += stride;
1925 __lsx_vstelm_w(vec4, dst, 0, 3);
1926 __lsx_vstelm_h(vec2, dst, 4, 7);
1927 dst += stride;
1928 __lsx_vstelm_w(vec6, dst, 0, 0);
1929 __lsx_vstelm_h(vec5, dst, 4, 0);
1930 dst += stride;
1931 __lsx_vstelm_w(vec6, dst, 0, 1);
1932 __lsx_vstelm_h(vec5, dst, 4, 1);
1933 dst += stride;
1934 __lsx_vstelm_w(vec6, dst, 0, 2);
1935 __lsx_vstelm_h(vec5, dst, 4, 2);
1936 dst += stride;
1937 __lsx_vstelm_w(vec6, dst, 0, 3);
1938 __lsx_vstelm_h(vec5, dst, 4, 3);
1939 dst += stride;
1940 __lsx_vstelm_w(vec7, dst, 0, 0);
1941 __lsx_vstelm_h(vec5, dst, 4, 4);
1942 dst += stride;
1943 __lsx_vstelm_w(vec7, dst, 0, 1);
1944 __lsx_vstelm_h(vec5, dst, 4, 5);
1945 dst += stride;
1946 __lsx_vstelm_w(vec7, dst, 0, 2);
1947 __lsx_vstelm_h(vec5, dst, 4, 6);
1948 dst += stride;
1949 __lsx_vstelm_w(vec7, dst, 0, 3);
1950 __lsx_vstelm_h(vec5, dst, 4, 7);
1951 }
1952 }
1953
1954 void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
1955 int32_t b_limit_ptr,
1956 int32_t limit_ptr,
1957 int32_t thresh_ptr)
1958 {
1959 ptrdiff_t stride2 = stride << 1;
1960 ptrdiff_t stride3 = stride2 + stride;
1961 ptrdiff_t stride4 = stride2 << 1;
1962 uint8_t *dst_tmp = dst - 4;
1963 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1964 __m128i p1_out, p0_out, q0_out, q1_out;
1965 __m128i flat, mask, hev, thresh, b_limit, limit;
1966 __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1967 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1968 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1969 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1970 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1971 __m128i zero = __lsx_vldi(0);
1972
1973 p0 = __lsx_vld(dst_tmp, 0);
1974 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1975 p3 = __lsx_vldx(dst_tmp, stride3);
1976 dst_tmp += stride4;
1977 row4 = __lsx_vld(dst_tmp, 0);
1978 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1979 row7 = __lsx_vldx(dst_tmp, stride3);
1980 dst_tmp += stride4;
1981 q3 = __lsx_vld(dst_tmp, 0);
1982 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1983 q0 = __lsx_vldx(dst_tmp, stride3);
1984 dst_tmp += stride4;
1985 row12 = __lsx_vld(dst_tmp, 0);
1986 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1987 row15 = __lsx_vldx(dst_tmp, stride3);
1988
1989 /* transpose 16x8 matrix into 8x16 */
1990 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1991 q3, q2, q1, q0, row12, row13, row14, row15,
1992 p3, p2, p1, p0, q0, q1, q2, q3);
1993
1994 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1995 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1996 thresh = __lsx_vilvl_d(vec0, thresh);
1997
1998 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1999 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
2000 b_limit = __lsx_vilvl_d(vec0, b_limit);
2001
2002 limit = __lsx_vreplgr2vr_b(limit_ptr);
2003 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
2004 limit = __lsx_vilvl_d(vec0, limit);
2005
2006 /* mask and hev */
2007 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2008 hev, mask, flat);
2009 /* flat4 */
2010 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2011 /* filter4 */
2012 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2013 q1_out);
2014
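/* 4+8 case: only the second block (rows 8-15, high half of the vectors)
 * may take the filter8 path, so keep just the high half of the flat mask */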
2015 flat = __lsx_vilvh_d(flat, zero);
2016
2017 /* if flat is zero for all pixels, then no need to calculate other filter */
2018 if (__lsx_bz_v(flat)) {
2019 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2020 vec2 = __lsx_vilvl_h(vec1, vec0);
2021 vec3 = __lsx_vilvh_h(vec1, vec0);
2022 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2023 vec4 = __lsx_vilvl_h(vec1, vec0);
2024 vec5 = __lsx_vilvh_h(vec1, vec0);
2025
2026 dst -= 2;
2027 __lsx_vstelm_w(vec2, dst, 0, 0);
2028 __lsx_vstelm_w(vec2, dst + stride, 0, 1);
2029 __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
2030 __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
2031 dst += stride4;
2032 __lsx_vstelm_w(vec3, dst, 0, 0);
2033 __lsx_vstelm_w(vec3, dst + stride, 0, 1);
2034 __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
2035 __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
2036 dst += stride4;
2037 __lsx_vstelm_w(vec4, dst, 0, 0);
2038 __lsx_vstelm_w(vec4, dst + stride, 0, 1);
2039 __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
2040 __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
2041 dst += stride4;
2042 __lsx_vstelm_w(vec5, dst, 0, 0);
2043 __lsx_vstelm_w(vec5, dst + stride, 0, 1);
2044 __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
2045 __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
2046 } else {
2047 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2048 p3_h, p2_h, p1_h, p0_h);
2049 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2050 q0_h, q1_h, q2_h, q3_h);
2051
2052 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2053 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2054
2055 /* convert 16 bit output data into 8 bit */
2056 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
2057 p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
2058 p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
2059 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
2060 q2_filt8_h, q1_filt8_h, q2_filt8_h);
2061
2062 /* store pixel values */
2063 p2 = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
2064 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
2065 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
2066 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
2067 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
2068 q2 = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
2069
2070 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2071 vec3 = __lsx_vilvl_h(vec1, vec0);
2072 vec4 = __lsx_vilvh_h(vec1, vec0);
2073 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2074 vec6 = __lsx_vilvl_h(vec1, vec0);
2075 vec7 = __lsx_vilvh_h(vec1, vec0);
2076 vec2 = __lsx_vilvl_b(q2, q1);
2077 vec5 = __lsx_vilvh_b(q2, q1);
2078
2079 dst -= 3;
2080 __lsx_vstelm_w(vec3, dst, 0, 0);
2081 __lsx_vstelm_h(vec2, dst, 4, 0);
2082 dst += stride;
2083 __lsx_vstelm_w(vec3, dst, 0, 1);
2084 __lsx_vstelm_h(vec2, dst, 4, 1);
2085 dst += stride;
2086 __lsx_vstelm_w(vec3, dst, 0, 2);
2087 __lsx_vstelm_h(vec2, dst, 4, 2);
2088 dst += stride;
2089 __lsx_vstelm_w(vec3, dst, 0, 3);
2090 __lsx_vstelm_h(vec2, dst, 4, 3);
2091 dst += stride;
2092 __lsx_vstelm_w(vec4, dst, 0, 0);
2093 __lsx_vstelm_h(vec2, dst, 4, 4);
2094 dst += stride;
2095 __lsx_vstelm_w(vec4, dst, 0, 1);
2096 __lsx_vstelm_h(vec2, dst, 4, 5);
2097 dst += stride;
2098 __lsx_vstelm_w(vec4, dst, 0, 2);
2099 __lsx_vstelm_h(vec2, dst, 4, 6);
2100 dst += stride;
2101 __lsx_vstelm_w(vec4, dst, 0, 3);
2102 __lsx_vstelm_h(vec2, dst, 4, 7);
2103 dst += stride;
2104 __lsx_vstelm_w(vec6, dst, 0, 0);
2105 __lsx_vstelm_h(vec5, dst, 4, 0);
2106 dst += stride;
2107 __lsx_vstelm_w(vec6, dst, 0, 1);
2108 __lsx_vstelm_h(vec5, dst, 4, 1);
2109 dst += stride;
2110 __lsx_vstelm_w(vec6, dst, 0, 2);
2111 __lsx_vstelm_h(vec5, dst, 4, 2);
2112 dst += stride;
2113 __lsx_vstelm_w(vec6, dst, 0, 3);
2114 __lsx_vstelm_h(vec5, dst, 4, 3);
2115 dst += stride;
2116 __lsx_vstelm_w(vec7, dst, 0, 0);
2117 __lsx_vstelm_h(vec5, dst, 4, 4);
2118 dst += stride;
2119 __lsx_vstelm_w(vec7, dst, 0, 1);
2120 __lsx_vstelm_h(vec5, dst, 4, 5);
2121 dst += stride;
2122 __lsx_vstelm_w(vec7, dst, 0, 2);
2123 __lsx_vstelm_h(vec5, dst, 4, 6);
2124 dst += stride;
2125 __lsx_vstelm_w(vec7, dst, 0, 3);
2126 __lsx_vstelm_h(vec5, dst, 4, 7);
2127 }
2128 }
2129
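/* Transpose helper for the 8-row wide vertical filter: reads 8 rows of 16
 * pixels spanning the edge and writes them transposed as 16 rows of 8
 * pixels (16-byte row pitch) into a scratch buffer, turning the vertical
 * edge into a horizontal one. */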
2130 static void vp9_transpose_16x8_to_8x16(uint8_t *input, ptrdiff_t in_pitch,
2131 uint8_t *output)
2132 {
2133 __m128i p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
2134 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2135 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2136 ptrdiff_t in_pitch2 = in_pitch << 1;
2137 ptrdiff_t in_pitch3 = in_pitch2 + in_pitch;
2138 ptrdiff_t in_pitch4 = in_pitch2 << 1;
2139
2140 LSX_LD_8(input, in_pitch, in_pitch2, in_pitch3, in_pitch4,
2141 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
2142 /* 8x8 transpose */
2143 LSX_TRANSPOSE8x8_B(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
2144 p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
2145 /* 8x8 transpose */
2146 DUP4_ARG2(__lsx_vilvh_b, p5_org, p7_org, p4_org, p6_org, p1_org,
2147 p3_org, p0_org, p2_org, tmp0, tmp1, tmp2, tmp3);
2148 DUP2_ARG2(__lsx_vilvl_b, tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
2149 DUP2_ARG2(__lsx_vilvh_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
2150 DUP2_ARG2(__lsx_vilvl_w, tmp6, tmp4, tmp7, tmp5, q0, q4);
2151 DUP2_ARG2(__lsx_vilvh_w, tmp6, tmp4, tmp7, tmp5, q2, q6);
2152 DUP4_ARG2(__lsx_vbsrl_v, q0, 8, q2, 8, q4, 8, q6, 8, q1, q3, q5, q7);
2153
2154 __lsx_vst(p7, output, 0);
2155 __lsx_vst(p6, output, 16);
2156 __lsx_vst(p5, output, 32);
2157 __lsx_vst(p4, output, 48);
2158 __lsx_vst(p3, output, 64);
2159 __lsx_vst(p2, output, 80);
2160 __lsx_vst(p1, output, 96);
2161 __lsx_vst(p0, output, 112);
2162 __lsx_vst(q0, output, 128);
2163 __lsx_vst(q1, output, 144);
2164 __lsx_vst(q2, output, 160);
2165 __lsx_vst(q3, output, 176);
2166 __lsx_vst(q4, output, 192);
2167 __lsx_vst(q5, output, 208);
2168 __lsx_vst(q6, output, 224);
2169 __lsx_vst(q7, output, 240);
2170 }
2171
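/* Inverse of the transpose above: reads the filtered 16x8 block back from
 * the scratch buffer and stores it as 8 rows of 16 pixels at out_pitch. */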
2172 static void vp9_transpose_8x16_to_16x8(uint8_t *input, uint8_t *output,
2173 ptrdiff_t out_pitch)
2174 {
2175 __m128i p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
2176 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2177 ptrdiff_t out_pitch2 = out_pitch << 1;
2178 ptrdiff_t out_pitch3 = out_pitch2 + out_pitch;
2179 ptrdiff_t out_pitch4 = out_pitch2 << 1;
2180
2181 DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
2182 p7, p6, p5, p4);
2183 DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
2184 p3, p2, p1, p0);
2185 DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176,
2186 q0, q1, q2, q3);
2187 DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240,
2188 q4, q5, q6, q7);
2189 LSX_TRANSPOSE16x8_B(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
2190 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
2191 LSX_ST_8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o,
2192 output, out_pitch, out_pitch2, out_pitch3, out_pitch4);
2193 }
2194
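/* Full 16x16 byte transpose with independent input and output strides,
 * used both to load the 16-row edge neighbourhood into the scratch buffer
 * and to write the filtered result back. */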
2195 static void vp9_transpose_16x16(uint8_t *input, int32_t in_stride,
2196 uint8_t *output, int32_t out_stride)
2197 {
2198 __m128i row0, row1, row2, row3, row4, row5, row6, row7;
2199 __m128i row8, row9, row10, row11, row12, row13, row14, row15;
2200 __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
2201 __m128i tmp2, tmp3;
2202 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2203 int32_t in_stride2 = in_stride << 1;
2204 int32_t in_stride3 = in_stride2 + in_stride;
2205 int32_t in_stride4 = in_stride2 << 1;
2206 int32_t out_stride2 = out_stride << 1;
2207 int32_t out_stride3 = out_stride2 + out_stride;
2208 int32_t out_stride4 = out_stride2 << 1;
2209
2210 LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2211 row0, row1, row2, row3, row4, row5, row6, row7);
2212 input += in_stride4;
2213 LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2214 row8, row9, row10, row11, row12, row13, row14, row15);
2215
2216 LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
2217 row8, row9, row10, row11, row12, row13, row14, row15,
2218 p7, p6, p5, p4, p3, p2, p1, p0);
2219
2220 /* transpose 16x8 matrix into 8x16 */
2221 /* total 8 intermediate registers and 32 instructions */
2222 q7 = __lsx_vpackod_d(row8, row0);
2223 q6 = __lsx_vpackod_d(row9, row1);
2224 q5 = __lsx_vpackod_d(row10, row2);
2225 q4 = __lsx_vpackod_d(row11, row3);
2226 q3 = __lsx_vpackod_d(row12, row4);
2227 q2 = __lsx_vpackod_d(row13, row5);
2228 q1 = __lsx_vpackod_d(row14, row6);
2229 q0 = __lsx_vpackod_d(row15, row7);
2230
2231 DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
2232 DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
2233
2234 DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
2235 DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
2236
2237 DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
2238 q0 = __lsx_vpackev_w(tmp3, tmp2);
2239 q4 = __lsx_vpackod_w(tmp3, tmp2);
2240
2241 tmp2 = __lsx_vpackod_h(tmp1, tmp0);
2242 tmp3 = __lsx_vpackod_h(q7, q5);
2243 q2 = __lsx_vpackev_w(tmp3, tmp2);
2244 q6 = __lsx_vpackod_w(tmp3, tmp2);
2245
2246 DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
2247 q1 = __lsx_vpackev_w(tmp3, tmp2);
2248 q5 = __lsx_vpackod_w(tmp3, tmp2);
2249
2250 tmp2 = __lsx_vpackod_h(tmp5, tmp4);
2251 tmp3 = __lsx_vpackod_h(tmp7, tmp6);
2252 q3 = __lsx_vpackev_w(tmp3, tmp2);
2253 q7 = __lsx_vpackod_w(tmp3, tmp2);
2254
2255 LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride,
2256 out_stride2, out_stride3, out_stride4);
2257 output += out_stride4;
2258 LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride,
2259 out_stride2, out_stride3, out_stride4);
2260 }
2261
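/* First stage of the 8-row wide vertical filter, operating on the
 * transposed scratch buffer (16-byte row pitch).  When no pixel needs more
 * than filter4 the result is stored straight back to the frame via src_org
 * and 1 is returned; otherwise the filter8 output and the flat mask are
 * saved to filter48[] for the second stage and 0 is returned. */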
2262 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
2263 uint8_t *src_org, int32_t pitch_org,
2264 int32_t b_limit_ptr,
2265 int32_t limit_ptr,
2266 int32_t thresh_ptr)
2267 {
2268 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2269 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2270 __m128i flat, mask, hev, thresh, b_limit, limit;
2271 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2272 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2273 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2274 __m128i vec0, vec1, vec2, vec3;
2275 __m128i zero = __lsx_vldi(0);
2276
2277 /* load vector elements */
2278 DUP4_ARG2(__lsx_vld, src, -64, src, -48, src, -32, src, -16,
2279 p3, p2, p1, p0);
2280 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, q0, q1, q2, q3);
2281
2282 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
2283 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2284 limit = __lsx_vreplgr2vr_b(limit_ptr);
2285
2286 /* mask and hev */
2287 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2288 hev, mask, flat);
2289 /* flat4 */
2290 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2291 /* filter4 */
2292 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2293 q1_out);
2294
2295 flat = __lsx_vilvl_d(zero, flat);
2296
2297 /* if flat is zero for all pixels, then no need to calculate other filter */
2298 if (__lsx_bz_v(flat)) {
2299 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2300 vec2 = __lsx_vilvl_h(vec1, vec0);
2301 vec3 = __lsx_vilvh_h(vec1, vec0);
2302
2303 src_org -= 2;
2304 __lsx_vstelm_w(vec2, src_org, 0, 0);
2305 src_org += pitch_org;
2306 __lsx_vstelm_w(vec2, src_org, 0, 1);
2307 src_org += pitch_org;
2308 __lsx_vstelm_w(vec2, src_org, 0, 2);
2309 src_org += pitch_org;
2310 __lsx_vstelm_w(vec2, src_org, 0, 3);
2311 src_org += pitch_org;
2312 __lsx_vstelm_w(vec3, src_org, 0, 0);
2313 src_org += pitch_org;
2314 __lsx_vstelm_w(vec3, src_org, 0, 1);
2315 src_org += pitch_org;
2316 __lsx_vstelm_w(vec3, src_org, 0, 2);
2317 src_org += pitch_org;
2318 __lsx_vstelm_w(vec3, src_org, 0, 3);
2319 return 1;
2320 } else {
2321 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2322 p3_l, p2_l, p1_l, p0_l);
2323 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2324 q0_l, q1_l, q2_l, q3_l);
2325 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2326 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2327
2328 /* convert 16 bit output data into 8 bit */
2329 p2_l = __lsx_vpickev_b(p2_filt8_l, p2_filt8_l);
2330 p1_l = __lsx_vpickev_b(p1_filt8_l, p1_filt8_l);
2331 p0_l = __lsx_vpickev_b(p0_filt8_l, p0_filt8_l);
2332 q0_l = __lsx_vpickev_b(q0_filt8_l, q0_filt8_l);
2333 q1_l = __lsx_vpickev_b(q1_filt8_l, q1_filt8_l);
2334 q2_l = __lsx_vpickev_b(q2_filt8_l, q2_filt8_l);
2335
2336 /* store pixel values */
2337 p2_out = __lsx_vbitsel_v(p2, p2_l, flat);
2338 p1_out = __lsx_vbitsel_v(p1_out, p1_l, flat);
2339 p0_out = __lsx_vbitsel_v(p0_out, p0_l, flat);
2340 q0_out = __lsx_vbitsel_v(q0_out, q0_l, flat);
2341 q1_out = __lsx_vbitsel_v(q1_out, q1_l, flat);
2342 q2_out = __lsx_vbitsel_v(q2, q2_l, flat);
2343
2344 __lsx_vst(p2_out, filter48, 0);
2345 __lsx_vst(p1_out, filter48, 16);
2346 __lsx_vst(p0_out, filter48, 32);
2347 __lsx_vst(q0_out, filter48, 48);
2348 __lsx_vst(q1_out, filter48, 64);
2349 __lsx_vst(q2_out, filter48, 80);
2350 __lsx_vst(flat, filter48, 96);
2351
2352 return 0;
2353 }
2354 }
2355
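/* Second stage of the 8-row wide vertical filter: computes the flat2 mask
 * from p7..q7 in the scratch buffer.  If flat2 is zero everywhere the
 * filter48[] results are written back to the frame and 1 is returned;
 * otherwise the 15-tap filter is evaluated as a running sum and the output
 * rows are stored back into the scratch buffer. */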
2356 static int32_t vp9_vt_lpf_t16_8w(uint8_t *dst, uint8_t *dst_org,
2357 ptrdiff_t stride,
2358 uint8_t *filter48)
2359 {
2360 __m128i zero = __lsx_vldi(0);
2361 __m128i filter8, flat, flat2;
2362 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2363 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2364 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2365 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2366 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2367 v8u16 tmp0_l, tmp1_l;
2368 __m128i out_l;
2369 uint8_t *dst_tmp = dst - 128;
2370
2371 /* load vector elements */
2372 DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2373 dst_tmp, 48, p7, p6, p5, p4);
2374 DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2375 dst_tmp, 112, p3, p2, p1, p0);
2376 DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2377 DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2378
2379 flat = __lsx_vld(filter48, 96);
2380
2382 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2383
2384 /* if flat2 is zero for all pixels, then no need to calculate other filter */
2385 if (__lsx_bz_v(flat2)) {
2386 __m128i vec0, vec1, vec2, vec3, vec4;
2387
2388 DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2389 filter48, 48, p2, p1, p0, q0);
2390 DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2391
2392 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2393 vec3 = __lsx_vilvl_h(vec1, vec0);
2394 vec4 = __lsx_vilvh_h(vec1, vec0);
2395 vec2 = __lsx_vilvl_b(q2, q1);
2396
2397 dst_org -= 3;
2398 __lsx_vstelm_w(vec3, dst_org, 0, 0);
2399 __lsx_vstelm_h(vec2, dst_org, 4, 0);
2400 dst_org += stride;
2401 __lsx_vstelm_w(vec3, dst_org, 0, 1);
2402 __lsx_vstelm_h(vec2, dst_org, 4, 1);
2403 dst_org += stride;
2404 __lsx_vstelm_w(vec3, dst_org, 0, 2);
2405 __lsx_vstelm_h(vec2, dst_org, 4, 2);
2406 dst_org += stride;
2407 __lsx_vstelm_w(vec3, dst_org, 0, 3);
2408 __lsx_vstelm_h(vec2, dst_org, 4, 3);
2409 dst_org += stride;
2410 __lsx_vstelm_w(vec4, dst_org, 0, 0);
2411 __lsx_vstelm_h(vec2, dst_org, 4, 4);
2412 dst_org += stride;
2413 __lsx_vstelm_w(vec4, dst_org, 0, 1);
2414 __lsx_vstelm_h(vec2, dst_org, 4, 5);
2415 dst_org += stride;
2416 __lsx_vstelm_w(vec4, dst_org, 0, 2);
2417 __lsx_vstelm_h(vec2, dst_org, 4, 6);
2418 dst_org += stride;
2419 __lsx_vstelm_w(vec4, dst_org, 0, 3);
2420 __lsx_vstelm_h(vec2, dst_org, 4, 7);
2421 return 1;
2422 } else {
2423 dst -= 7 * 16;
2424
2425 p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2426 p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2427 p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2428 p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2429 p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2430 p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2431 p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2432 p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2433 q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2434
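/* p6 */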
2435 tmp0_l = p7_l_in << 3;
2436 tmp0_l -= p7_l_in;
2437 tmp0_l += p6_l_in;
2438 tmp0_l += q0_l_in;
2439 tmp1_l = p6_l_in + p5_l_in;
2440 tmp1_l += p4_l_in;
2441 tmp1_l += p3_l_in;
2442 tmp1_l += p2_l_in;
2443 tmp1_l += p1_l_in;
2444 tmp1_l += p0_l_in;
2445 tmp1_l += tmp0_l;
2446
2447 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2448 out_l = __lsx_vpickev_b(out_l, out_l);
2449 p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2450 __lsx_vstelm_d(p6, dst, 0, 0);
2451 dst += 16;
2452
2453 /* p5 */
2454 q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2455 tmp0_l = p5_l_in - p6_l_in;
2456 tmp0_l += q1_l_in;
2457 tmp0_l -= p7_l_in;
2458 tmp1_l += tmp0_l;
2459 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2460 out_l = __lsx_vpickev_b(out_l, out_l);
2461 p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2462 __lsx_vstelm_d(p5, dst, 0, 0);
2463 dst += 16;
2464
2465 /* p4 */
2466 q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2467 tmp0_l = p4_l_in - p5_l_in;
2468 tmp0_l += q2_l_in;
2469 tmp0_l -= p7_l_in;
2470 tmp1_l += tmp0_l;
2471 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2472 out_l = __lsx_vpickev_b(out_l, out_l);
2473 p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2474 __lsx_vstelm_d(p4, dst, 0, 0);
2475 dst += 16;
2476
2477 /* p3 */
2478 q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2479 tmp0_l = p3_l_in - p4_l_in;
2480 tmp0_l += q3_l_in;
2481 tmp0_l -= p7_l_in;
2482 tmp1_l += tmp0_l;
2483 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2484 out_l = __lsx_vpickev_b(out_l, out_l);
2485 p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2486 __lsx_vstelm_d(p3, dst, 0, 0);
2487 dst += 16;
2488
2489 /* p2 */
2490 q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2491 filter8 = __lsx_vld(filter48, 0);
2492 tmp0_l = p2_l_in - p3_l_in;
2493 tmp0_l += q4_l_in;
2494 tmp0_l -= p7_l_in;
2495 tmp1_l += tmp0_l;
2496 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2497 out_l = __lsx_vpickev_b(out_l, out_l);
2498 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2499 __lsx_vstelm_d(filter8, dst, 0, 0);
2500 dst += 16;
2501
2502 /* p1 */
2503 q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2504 filter8 = __lsx_vld(filter48, 16);
2505 tmp0_l = p1_l_in - p2_l_in;
2506 tmp0_l += q5_l_in;
2507 tmp0_l -= p7_l_in;
2508 tmp1_l += tmp0_l;
2509 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2510 out_l = __lsx_vpickev_b(out_l, out_l);
2511 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2512 __lsx_vstelm_d(filter8, dst, 0, 0);
2513 dst += 16;
2514
2515 /* p0 */
2516 q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2517 filter8 = __lsx_vld(filter48, 32);
2518 tmp0_l = p0_l_in - p1_l_in;
2519 tmp0_l += q6_l_in;
2520 tmp0_l -= p7_l_in;
2521 tmp1_l += tmp0_l;
2522 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2523 out_l = __lsx_vpickev_b(out_l, out_l);
2524 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2525 __lsx_vstelm_d(filter8, dst, 0, 0);
2526 dst += 16;
2527
2528 /* q0 */
2529 q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
2530 filter8 = __lsx_vld(filter48, 48);
2531 tmp0_l = q7_l_in - p0_l_in;
2532 tmp0_l += q0_l_in;
2533 tmp0_l -= p7_l_in;
2534 tmp1_l += tmp0_l;
2535 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2536 out_l = __lsx_vpickev_b(out_l, out_l);
2537 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2538 __lsx_vstelm_d(filter8, dst, 0, 0);
2539 dst += 16;
2540
2541 /* q1 */
2542 filter8 = __lsx_vld(filter48, 64);
2543 tmp0_l = q7_l_in - q0_l_in;
2544 tmp0_l += q1_l_in;
2545 tmp0_l -= p6_l_in;
2546 tmp1_l += tmp0_l;
2547 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2548 out_l = __lsx_vpickev_b(out_l, out_l);
2549 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2550 __lsx_vstelm_d(filter8, dst, 0, 0);
2551 dst += 16;
2552
2553 /* q2 */
2554 filter8 = __lsx_vld(filter48, 80);
2555 tmp0_l = q7_l_in - q1_l_in;
2556 tmp0_l += q2_l_in;
2557 tmp0_l -= p5_l_in;
2558 tmp1_l += tmp0_l;
2559 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2560 out_l = __lsx_vpickev_b(out_l, out_l);
2561 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2562 __lsx_vstelm_d(filter8, dst, 0, 0);
2563 dst += 16;
2564
2565 /* q3 */
2566 tmp0_l = q7_l_in - q2_l_in;
2567 tmp0_l += q3_l_in;
2568 tmp0_l -= p4_l_in;
2569 tmp1_l += tmp0_l;
2570 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2571 out_l = __lsx_vpickev_b(out_l, out_l);
2572 q3 = __lsx_vbitsel_v(q3, out_l, flat2);
2573 __lsx_vstelm_d(q3, dst, 0, 0);
2574 dst += 16;
2575
2576 /* q4 */
2577 tmp0_l = q7_l_in - q3_l_in;
2578 tmp0_l += q4_l_in;
2579 tmp0_l -= p3_l_in;
2580 tmp1_l += tmp0_l;
2581 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2582 out_l = __lsx_vpickev_b(out_l, out_l);
2583 q4 = __lsx_vbitsel_v(q4, out_l, flat2);
2584 __lsx_vstelm_d(q4, dst, 0, 0);
2585 dst += 16;
2586
2587 /* q5 */
2588 tmp0_l = q7_l_in - q4_l_in;
2589 tmp0_l += q5_l_in;
2590 tmp0_l -= p2_l_in;
2591 tmp1_l += tmp0_l;
2592 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2593 out_l = __lsx_vpickev_b(out_l, out_l);
2594 q5 = __lsx_vbitsel_v(q5, out_l, flat2);
2595 __lsx_vstelm_d(q5, dst, 0, 0);
2596 dst += 16;
2597
2598 /* q6 */
2599 tmp0_l = q7_l_in - q5_l_in;
2600 tmp0_l += q6_l_in;
2601 tmp0_l -= p1_l_in;
2602 tmp1_l += tmp0_l;
2603 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2604 out_l = __lsx_vpickev_b(out_l, out_l);
2605 q6 = __lsx_vbitsel_v(q6, out_l, flat2);
2606 __lsx_vstelm_d(q6, dst, 0, 0);
2607
2608 return 0;
2609 }
2610 }
2611
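/* 8-row vertical wide loop filter: transpose the edge neighbourhood into a
 * scratch buffer, filter it there, and transpose back only when the full
 * wide filter actually modified pixels. */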
2612 void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
2613 int32_t b_limit_ptr,
2614 int32_t limit_ptr,
2615 int32_t thresh_ptr)
2616 {
2617 uint8_t early_exit = 0;
2618 uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
2619 uint8_t *filter48 = &transposed_input[16 * 16];
2620
2621 vp9_transpose_16x8_to_8x16(dst - 8, stride, transposed_input);
2622
2623 early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2624 &filter48[0], dst, stride,
2625 b_limit_ptr, limit_ptr, thresh_ptr);
2626
2627 if (0 == early_exit) {
2628 early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), dst, stride,
2629 &filter48[0]);
2630
2631 if (0 == early_exit) {
2632 vp9_transpose_8x16_to_16x8(transposed_input, dst - 8, stride);
2633 }
2634 }
2635 }
2636
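/* 16-row variant of vp9_vt_lpf_t4_and_t8_8w: both 64-bit halves of every
 * vector are valid, so filter4/filter8 run on all 16 rows and the early
 * exit stores 4 pixels per row for 16 rows. */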
2637 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
2638 uint8_t *dst_org, ptrdiff_t stride,
2639 int32_t b_limit_ptr,
2640 int32_t limit_ptr,
2641 int32_t thresh_ptr)
2642 {
2643 ptrdiff_t stride2 = stride << 1;
2644 ptrdiff_t stride3 = stride2 + stride;
2645 ptrdiff_t stride4 = stride2 << 1;
2646 __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2647 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2648 __m128i flat, mask, hev, thresh, b_limit, limit;
2649 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2650 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
2651 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2652 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2653 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
2654 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
2655 __m128i vec0, vec1, vec2, vec3, vec4, vec5;
2656 __m128i zero = __lsx_vldi(0);
2657
2658 /* load vector elements */
2659 DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16,
2660 p3, p2, p1, p0);
2661 DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2662
2663 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
2664 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2665 limit = __lsx_vreplgr2vr_b(limit_ptr);
2666
2667 /* mask and hev */
2668 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2669 hev, mask, flat);
2670 /* flat4 */
2671 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2672 /* filter4 */
2673 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2674 q1_out);
2675
2676 /* if flat is zero for all pixels, then no need to calculate other filter */
2677 if (__lsx_bz_v(flat)) {
2678 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2679 vec2 = __lsx_vilvl_h(vec1, vec0);
2680 vec3 = __lsx_vilvh_h(vec1, vec0);
2681 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2682 vec4 = __lsx_vilvl_h(vec1, vec0);
2683 vec5 = __lsx_vilvh_h(vec1, vec0);
2684
2685 dst_org -= 2;
2686 __lsx_vstelm_w(vec2, dst_org, 0, 0);
2687 __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
2688 __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
2689 __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
2690 dst_org += stride4;
2691 __lsx_vstelm_w(vec3, dst_org, 0, 0);
2692 __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
2693 __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
2694 __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
2695 dst_org += stride4;
2696 __lsx_vstelm_w(vec4, dst_org, 0, 0);
2697 __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
2698 __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
2699 __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
2700 dst_org += stride4;
2701 __lsx_vstelm_w(vec5, dst_org, 0, 0);
2702 __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
2703 __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
2704 __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
2705
2706 return 1;
2707 } else {
2708 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2709 p3_l, p2_l, p1_l, p0_l);
2710 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2711 q0_l, q1_l, q2_l, q3_l);
2712 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2713 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2714 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2715 p3_h, p2_h, p1_h, p0_h);
2716 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2717 q0_h, q1_h, q2_h, q3_h);
2718 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2719 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2720
2721 /* convert 16 bit output data into 8 bit */
2722 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
2723 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h,
2724 q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
2725 q0_filt8_l);
2726 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
2727 q2_filt8_l, q1_filt8_l, q2_filt8_l);
2728
2729 /* store pixel values */
2730 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
2731 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
2732 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
2733 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
2734 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
2735 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
2736
2737 __lsx_vst(p2_out, filter48, 0);
2738 __lsx_vst(p1_out, filter48, 16);
2739 __lsx_vst(p0_out, filter48, 32);
2740 __lsx_vst(q0_out, filter48, 48);
2741 __lsx_vst(q1_out, filter48, 64);
2742 __lsx_vst(q2_out, filter48, 80);
2743 __lsx_vst(flat, filter48, 96);
2744
2745 return 0;
2746 }
2747 }
2748
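/* 16-row variant of vp9_vt_lpf_t16_8w: the 15-tap running-sum filter is
 * evaluated separately on the low and high halves and full 16-byte rows
 * are written back into the scratch buffer. */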
2749 static int32_t vp9_vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org,
2750 ptrdiff_t stride,
2751 uint8_t *filter48)
2752 {
2753 __m128i zero = __lsx_vldi(0);
2754 __m128i flat, flat2, filter8;
2755 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2756 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2757 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2758 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2759 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2760 v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
2761 v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
2762 v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
2763 v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
2764 v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
2765 __m128i out_l, out_h;
2766 uint8_t *dst_tmp = dst - 128;
2767
2768 flat = __lsx_vld(filter48, 96);
2769
2770 DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2771 dst_tmp, 48, p7, p6, p5, p4);
2772 DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2773 dst_tmp, 112, p3, p2, p1, p0);
2774 DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2775 DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2776
2777 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2778
2779 /* if flat2 is zero for all pixels, then no need to calculate other filter */
2780 if (__lsx_bz_v(flat2)) {
2781 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2782
2783 DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2784 filter48, 48, p2, p1, p0, q0);
2785 DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2786
2787 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2788 vec3 = __lsx_vilvl_h(vec1, vec0);
2789 vec4 = __lsx_vilvh_h(vec1, vec0);
2790 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2791 vec6 = __lsx_vilvl_h(vec1, vec0);
2792 vec7 = __lsx_vilvh_h(vec1, vec0);
2793 vec2 = __lsx_vilvl_b(q2, q1);
2794 vec5 = __lsx_vilvh_b(q2, q1);
2795
2796 dst_org -= 3;
2797 __lsx_vstelm_w(vec3, dst_org, 0, 0);
2798 __lsx_vstelm_h(vec2, dst_org, 4, 0);
2799 dst_org += stride;
2800 __lsx_vstelm_w(vec3, dst_org, 0, 1);
2801 __lsx_vstelm_h(vec2, dst_org, 4, 1);
2802 dst_org += stride;
2803 __lsx_vstelm_w(vec3, dst_org, 0, 2);
2804 __lsx_vstelm_h(vec2, dst_org, 4, 2);
2805 dst_org += stride;
2806 __lsx_vstelm_w(vec3, dst_org, 0, 3);
2807 __lsx_vstelm_h(vec2, dst_org, 4, 3);
2808 dst_org += stride;
2809 __lsx_vstelm_w(vec4, dst_org, 0, 0);
2810 __lsx_vstelm_h(vec2, dst_org, 4, 4);
2811 dst_org += stride;
2812 __lsx_vstelm_w(vec4, dst_org, 0, 1);
2813 __lsx_vstelm_h(vec2, dst_org, 4, 5);
2814 dst_org += stride;
2815 __lsx_vstelm_w(vec4, dst_org, 0, 2);
2816 __lsx_vstelm_h(vec2, dst_org, 4, 6);
2817 dst_org += stride;
2818 __lsx_vstelm_w(vec4, dst_org, 0, 3);
2819 __lsx_vstelm_h(vec2, dst_org, 4, 7);
2820 dst_org += stride;
2821 __lsx_vstelm_w(vec6, dst_org, 0, 0);
2822 __lsx_vstelm_h(vec5, dst_org, 4, 0);
2823 dst_org += stride;
2824 __lsx_vstelm_w(vec6, dst_org, 0, 1);
2825 __lsx_vstelm_h(vec5, dst_org, 4, 1);
2826 dst_org += stride;
2827 __lsx_vstelm_w(vec6, dst_org, 0, 2);
2828 __lsx_vstelm_h(vec5, dst_org, 4, 2);
2829 dst_org += stride;
2830 __lsx_vstelm_w(vec6, dst_org, 0, 3);
2831 __lsx_vstelm_h(vec5, dst_org, 4, 3);
2832 dst_org += stride;
2833 __lsx_vstelm_w(vec7, dst_org, 0, 0);
2834 __lsx_vstelm_h(vec5, dst_org, 4, 4);
2835 dst_org += stride;
2836 __lsx_vstelm_w(vec7, dst_org, 0, 1);
2837 __lsx_vstelm_h(vec5, dst_org, 4, 5);
2838 dst_org += stride;
2839 __lsx_vstelm_w(vec7, dst_org, 0, 2);
2840 __lsx_vstelm_h(vec5, dst_org, 4, 6);
2841 dst_org += stride;
2842 __lsx_vstelm_w(vec7, dst_org, 0, 3);
2843 __lsx_vstelm_h(vec5, dst_org, 4, 7);
2844
2845 return 1;
2846 } else {
2847 dst -= 7 * 16;
2848
2849 p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2850 p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2851 p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2852 p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2853 p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2854 p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2855 p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2856 p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2857 q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2858
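/* p6 */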
2859 tmp0_l = p7_l_in << 3;
2860 tmp0_l -= p7_l_in;
2861 tmp0_l += p6_l_in;
2862 tmp0_l += q0_l_in;
2863 tmp1_l = p6_l_in + p5_l_in;
2864 tmp1_l += p4_l_in;
2865 tmp1_l += p3_l_in;
2866 tmp1_l += p2_l_in;
2867 tmp1_l += p1_l_in;
2868 tmp1_l += p0_l_in;
2869 tmp1_l += tmp0_l;
2870 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2871
2872 p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
2873 p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
2874 p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
2875 p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
2876 p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
2877 p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
2878 p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
2879 p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
2880 q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
2881
2882 tmp0_h = p7_h_in << 3;
2883 tmp0_h -= p7_h_in;
2884 tmp0_h += p6_h_in;
2885 tmp0_h += q0_h_in;
2886 tmp1_h = p6_h_in + p5_h_in;
2887 tmp1_h += p4_h_in;
2888 tmp1_h += p3_h_in;
2889 tmp1_h += p2_h_in;
2890 tmp1_h += p1_h_in;
2891 tmp1_h += p0_h_in;
2892 tmp1_h += tmp0_h;
2893 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2894
2895 out_l = __lsx_vpickev_b(out_h, out_l);
2896 p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2897 __lsx_vst(p6, dst, 0);
2898
2899 /* p5 */
2900 q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2901 tmp0_l = p5_l_in - p6_l_in;
2902 tmp0_l += q1_l_in;
2903 tmp0_l -= p7_l_in;
2904 tmp1_l += tmp0_l;
2905 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2906 q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
2907 tmp0_h = p5_h_in - p6_h_in;
2908 tmp0_h += q1_h_in;
2909 tmp0_h -= p7_h_in;
2910 tmp1_h += tmp0_h;
2911 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2912 out_l = __lsx_vpickev_b(out_h, out_l);
2913 p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2914 __lsx_vst(p5, dst, 16);
2915
2916 /* p4 */
2917 q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2918 tmp0_l = p4_l_in - p5_l_in;
2919 tmp0_l += q2_l_in;
2920 tmp0_l -= p7_l_in;
2921 tmp1_l += tmp0_l;
2922 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2923 q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
2924 tmp0_h = p4_h_in - p5_h_in;
2925 tmp0_h += q2_h_in;
2926 tmp0_h -= p7_h_in;
2927 tmp1_h += tmp0_h;
2928 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2929 out_l = __lsx_vpickev_b(out_h, out_l);
2930 p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2931 __lsx_vst(p4, dst, 16*2);
2932
2933 /* p3 */
2934 q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2935 tmp0_l = p3_l_in - p4_l_in;
2936 tmp0_l += q3_l_in;
2937 tmp0_l -= p7_l_in;
2938 tmp1_l += tmp0_l;
2939 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2940 q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
2941 tmp0_h = p3_h_in - p4_h_in;
2942 tmp0_h += q3_h_in;
2943 tmp0_h -= p7_h_in;
2944 tmp1_h += tmp0_h;
2945 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2946 out_l = __lsx_vpickev_b(out_h, out_l);
2947 p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2948 __lsx_vst(p3, dst, 16*3);
2949
2950 /* p2 */
2951 q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2952 filter8 = __lsx_vld(filter48, 0);
2953 tmp0_l = p2_l_in - p3_l_in;
2954 tmp0_l += q4_l_in;
2955 tmp0_l -= p7_l_in;
2956 tmp1_l += tmp0_l;
2957 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2958 q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
2959 tmp0_h = p2_h_in - p3_h_in;
2960 tmp0_h += q4_h_in;
2961 tmp0_h -= p7_h_in;
2962 tmp1_h += tmp0_h;
2963 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2964 out_l = __lsx_vpickev_b(out_h, out_l);
2965 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2966 __lsx_vst(filter8, dst, 16*4);
2967
2968 /* p1 */
2969 q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2970 filter8 = __lsx_vld(filter48, 16);
2971 tmp0_l = p1_l_in - p2_l_in;
2972 tmp0_l += q5_l_in;
2973 tmp0_l -= p7_l_in;
2974 tmp1_l += tmp0_l;
2975 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2976 q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
2977 tmp0_h = p1_h_in - p2_h_in;
2978 tmp0_h += q5_h_in;
2979 tmp0_h -= p7_h_in;
2980 tmp1_h += tmp0_h;
2981 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2982 out_l = __lsx_vpickev_b(out_h, out_l);
2983 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2984 __lsx_vst(filter8, dst, 16*5);
2985
2986 /* p0 */
2987 q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2988 filter8 = __lsx_vld(filter48, 32);
2989 tmp0_l = p0_l_in - p1_l_in;
2990 tmp0_l += q6_l_in;
2991 tmp0_l -= p7_l_in;
2992 tmp1_l += tmp0_l;
2993 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2994 q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
2995 tmp0_h = p0_h_in - p1_h_in;
2996 tmp0_h += q6_h_in;
2997 tmp0_h -= p7_h_in;
2998 tmp1_h += tmp0_h;
2999 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3000 out_l = __lsx_vpickev_b(out_h, out_l);
3001 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3002 __lsx_vst(filter8, dst, 16*6);
3003
3004 /* q0 */
3005 q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
3006 filter8 = __lsx_vld(filter48, 48);
3007 tmp0_l = q7_l_in - p0_l_in;
3008 tmp0_l += q0_l_in;
3009 tmp0_l -= p7_l_in;
3010 tmp1_l += tmp0_l;
3011 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3012 q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
3013 tmp0_h = q7_h_in - p0_h_in;
3014 tmp0_h += q0_h_in;
3015 tmp0_h -= p7_h_in;
3016 tmp1_h += tmp0_h;
3017 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3018 out_l = __lsx_vpickev_b(out_h, out_l);
3019 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3020 __lsx_vst(filter8, dst, 16*7);
3021
3022 /* q1 */
3023 filter8 = __lsx_vld(filter48, 64);
3024 tmp0_l = q7_l_in - q0_l_in;
3025 tmp0_l += q1_l_in;
3026 tmp0_l -= p6_l_in;
3027 tmp1_l += tmp0_l;
3028 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3029 tmp0_h = q7_h_in - q0_h_in;
3030 tmp0_h += q1_h_in;
3031 tmp0_h -= p6_h_in;
3032 tmp1_h += tmp0_h;
3033 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3034 out_l = __lsx_vpickev_b(out_h, out_l);
3035 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3036 __lsx_vst(filter8, dst, 16*8);
3037
3038 /* q2 */
3039 filter8 = __lsx_vld(filter48, 80);
3040 tmp0_l = q7_l_in - q1_l_in;
3041 tmp0_l += q2_l_in;
3042 tmp0_l -= p5_l_in;
3043 tmp1_l += tmp0_l;
3044 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3045 tmp0_h = q7_h_in - q1_h_in;
3046 tmp0_h += q2_h_in;
3047 tmp0_h -= p5_h_in;
3048 tmp1_h += tmp0_h;
3049 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3050 out_l = __lsx_vpickev_b(out_h, out_l);
3051 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3052 __lsx_vst(filter8, dst, 16*9);
3053
3054 /* q3 */
3055 tmp0_l = q7_l_in - q2_l_in;
3056 tmp0_l += q3_l_in;
3057 tmp0_l -= p4_l_in;
3058 tmp1_l += tmp0_l;
3059 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3060 tmp0_h = q7_h_in - q2_h_in;
3061 tmp0_h += q3_h_in;
3062 tmp0_h -= p4_h_in;
3063 tmp1_h += tmp0_h;
3064 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3065 out_l = __lsx_vpickev_b(out_h, out_l);
3066 q3 = __lsx_vbitsel_v(q3, out_l, flat2);
3067 __lsx_vst(q3, dst, 16*10);
3068
3069 /* q4 */
3070 tmp0_l = q7_l_in - q3_l_in;
3071 tmp0_l += q4_l_in;
3072 tmp0_l -= p3_l_in;
3073 tmp1_l += tmp0_l;
3074 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3075 tmp0_h = q7_h_in - q3_h_in;
3076 tmp0_h += q4_h_in;
3077 tmp0_h -= p3_h_in;
3078 tmp1_h += tmp0_h;
3079 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3080 out_l = __lsx_vpickev_b(out_h, out_l);
3081 q4 = __lsx_vbitsel_v(q4, out_l, flat2);
3082 __lsx_vst(q4, dst, 16*11);
3083
3084 /* q5 */
3085 tmp0_l = q7_l_in - q4_l_in;
3086 tmp0_l += q5_l_in;
3087 tmp0_l -= p2_l_in;
3088 tmp1_l += tmp0_l;
3089 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3090 tmp0_h = q7_h_in - q4_h_in;
3091 tmp0_h += q5_h_in;
3092 tmp0_h -= p2_h_in;
3093 tmp1_h += tmp0_h;
3094 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3095 out_l = __lsx_vpickev_b(out_h, out_l);
3096 q5 = __lsx_vbitsel_v(q5, out_l, flat2);
3097 __lsx_vst(q5, dst, 16*12);
3098
3099 /* q6 */
3100 tmp0_l = q7_l_in - q5_l_in;
3101 tmp0_l += q6_l_in;
3102 tmp0_l -= p1_l_in;
3103 tmp1_l += tmp0_l;
3104 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3105 tmp0_h = q7_h_in - q5_h_in;
3106 tmp0_h += q6_h_in;
3107 tmp0_h -= p1_h_in;
3108 tmp1_h += tmp0_h;
3109 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3110 out_l = __lsx_vpickev_b(out_h, out_l);
3111 q6 = __lsx_vbitsel_v(q6, out_l, flat2);
3112 __lsx_vst(q6, dst, 16*13);
3113
3114 return 0;
3115 }
3116 }
3117
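/* 16-row vertical wide loop filter: same strategy as the 8-row version but
 * using the 16x16 transpose in both directions. */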
3118 void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
3119 int32_t b_limit_ptr,
3120 int32_t limit_ptr,
3121 int32_t thresh_ptr)
3122 {
3123 uint8_t early_exit = 0;
3124 uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
3125 uint8_t *filter48 = &transposed_input[16 * 16];
3126
3127 vp9_transpose_16x16((dst - 8), stride, &transposed_input[0], 16);
3128
3129 early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
3130 &filter48[0], dst, stride,
3131 b_limit_ptr, limit_ptr, thresh_ptr);
3132
3133 if (0 == early_exit) {
3134 early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), dst,
3135 stride, &filter48[0]);
3136
3137 if (0 == early_exit) {
3138 vp9_transpose_16x16(transposed_input, 16, (dst - 8), stride);
3139 }
3140 }
3141 }
3142