/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src - stride_4x;
    uint8_t *p2 = src - stride_3x;
    uint8_t *p1 = src - stride_2x;
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + stride_2x;
    uint8_t *q3 = src + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;

    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp0, temp1;
    __m128i temp2, tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

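    /* Deblocking decision gradients from the HEVC spec: for each 4-column
     * half of the 8-pixel edge, dp = |p2 - 2*p1 + p0| and
     * dq = |q2 - 2*q1 + q0| are sampled at the outer positions (columns
     * 0/3 and 4/7 here), and their sums are weighed against beta to
     * decide whether the edge is filtered at all. */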
    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;

    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;
    DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_w(cmp1, cmp0);
    cmp3 = __lsx_vseqi_w(cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
                  p3_src, p2_src, p1_src, p0_src);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

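        /* Threshold setup: beta30 = beta / 8 and beta20 = beta / 4 drive
         * the strong-filter decision below, while tc25 = (5 * tc + 1) / 2
         * bounds |p0 - q0|; tc itself later clamps the filtered deltas. */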
        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);

        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
                  p0_src, p3_src, p2_src, p1_src, p0_src);
        DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
                  q0_src, q1_src, q2_src, q3_src);
        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                abs(p0[0] - q0[0]) < tc250;
        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
                  zero, q3_src, q0_src, q1_src, q2_src, q3_src);

        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                abs(p0[4] - q0[4]) < tc254;
        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
        cmp2 = __lsx_vpackev_w(cmp1, cmp0);
        cmp2 = __lsx_vseqi_w(cmp2, 0);

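        /* flag0/flag1 carry the strong-filter decision for the first and
         * second 4-column halves; the branches below handle both halves
         * strong, both weak, and the mixed case, where the cmp2 mask
         * selects between the two filtered results per half. */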
        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp0 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);

            /* pack results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);

            /* pack src to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);

            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_neg = __lsx_vneg_h(tc_pos);
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src,
                                    __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
                                    q_is_pcm_vec));
            DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                      q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);

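            /* Side decision: p1 (resp. q1) is only adjusted when the dp
             * (resp. dq) gradient sum stays below 3 * beta / 16, matching
             * the weak-filter side conditions of the HEVC spec. */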
            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);

            DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
                      tc_neg, tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst1, dst2, dst3, dst4);
            /* pack results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
            /* pack src to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
                      dst2, dst3);
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
                      dst0, dst1);

            p2 += stride;
            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            /* weak filter ends */
        } else { /* strong + weak */
            /* strong filter */
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);

            /* pack strong results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);
            /* strong filter ends */

            /* weak filter */
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);

            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
            DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));

            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
            /* weak filter ends */

            /* pack weak results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, delta2);

            /* select between weak or strong */
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);

            /* pack src to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);

            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
        }
    }
}

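/* Vertical-edge luma deblocking: the 8x8 block straddling the edge is
 * loaded row by row and transposed with LSX_TRANSPOSE8x8_B, so the same
 * arithmetic as the horizontal variant above applies; the results are
 * transposed back before the per-element stores. */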
void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src;
    uint8_t *p2 = src + stride_3x;
    uint8_t *p1 = src + stride_4x;
    uint8_t *p0 = src + stride_4x + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;

    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    __m128i cmp3;
    __m128i temp0, temp1;
    __m128i temp2;
    __m128i tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_d(cmp1, cmp0);
    cmp3 = __lsx_vseqi_d(cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        src -= 4;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
        src += stride_4x;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
        src -= stride_4x;

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                           q0_src, q1_src, q2_src, q3_src);

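        /* The scalar strong/weak checks below read the untransposed
         * pixels directly through the p3/p2/p1/p0 row pointers, while the
         * vector registers now hold the edge columns as rows. */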
        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                abs(p3[-1] - p3[0]) < tc250;
        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        cmp0 = __lsx_vreplgr2vr_d(flag0);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
                  p0_src, p3_src, p2_src, p1_src, p0_src);

        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                abs(p1[-1] - p1[0]) < tc254;
        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
                  q3_src, q0_src, q1_src, q2_src, q3_src);

        cmp1 = __lsx_vreplgr2vr_d(flag1);
        cmp2 = __lsx_vpackev_d(cmp1, cmp0);
        cmp2 = __lsx_vseqi_d(cmp2, 0);

        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_neg = __lsx_vneg_h(tc_pos);
            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);

            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst0, dst1, dst2, dst3);
            /* weak filter ends */

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
                      cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);

            /* transpose */
            dst4 = __lsx_vilvl_b(dst1, dst0);
            dst5 = __lsx_vilvh_b(dst1, dst0);
            dst0 = __lsx_vilvl_h(dst5, dst4);
            dst1 = __lsx_vilvh_h(dst5, dst4);

            src += 2;
            __lsx_vstelm_w(dst0, src, 0, 0);
            __lsx_vstelm_w(dst0, src + stride, 0, 1);
            __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
            src += stride_4x;
            __lsx_vstelm_w(dst1, src, 0, 0);
            __lsx_vstelm_w(dst1, src + stride, 0, 1);
            __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
            return;
        } else { /* strong + weak */
            /* strong filter */
            tc_neg = __lsx_vneg_h(tc_pos);

            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);

            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
            /* strong filter ends */

            /* weak filter */
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);

            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                    __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
            /* weak filter ends */

            /* select between weak or strong */
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
                      cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
                      dst4, dst5);
        }

        cmp3 = __lsx_vnor_v(cmp3, cmp3);
        DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
                  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
                  dst4, dst5);

        /* pack results to 8 bit */
        DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
                  dst5, dst0, dst1, dst2, dst3);

        /* transpose */
        DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
        DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);

        src += 1;
        __lsx_vstelm_w(dst0, src, 0, 0);
        __lsx_vstelm_h(dst2, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 1);
        __lsx_vstelm_h(dst2, src, 4, 2);
        src += stride;

        __lsx_vstelm_w(dst0, src, 0, 2);
        __lsx_vstelm_h(dst2, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 3);
        __lsx_vstelm_h(dst2, src, 4, 6);
        src += stride;

        __lsx_vstelm_w(dst1, src, 0, 0);
        __lsx_vstelm_h(dst3, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 1);
        __lsx_vstelm_h(dst3, src, 4, 2);
        src += stride;

        __lsx_vstelm_w(dst1, src, 0, 2);
        __lsx_vstelm_h(dst3, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 3);
        __lsx_vstelm_h(dst3, src, 4, 6);
    }
}

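/* Chroma deblocking applies the single HEVC chroma tap:
 * delta = clip3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3),
 * added to p0 and subtracted from q0, with pcm and tc <= 0 lanes
 * restored from the unfiltered source. */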
void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                        int32_t *tc, uint8_t *p_is_pcm,
                                        uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;

    if (tc[0] > 0 || tc[1] > 0) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);
        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
                  p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);
        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
        temp0 = __lsx_vadd_h(p0, delta);
        temp0 = __lsx_vclip255_h(temp0);
        p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
        temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);

        temp1 = __lsx_vsub_h(q0, delta);
        temp1 = __lsx_vclip255_h(temp1);
        q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
        temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpickev_b(temp1, temp0);
        __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
        __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
    }
}

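/* Vertical chroma edge: two columns either side of the edge are gathered
 * with an 8x4 byte transpose, filtered as above, and the two updated
 * columns are written back two bytes at a time. */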
void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
                                        int32_t *tc, uint8_t *p_is_pcm,
                                        uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;

    if (tc[0] > 0 || tc[1] > 0) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);

        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        src -= 2;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, src0, src1, src2, src3);
        src += stride_4x;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, src4, src5, src6, src7);
        src -= stride_4x;
        LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);

        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);

        temp0 = __lsx_vadd_h(p0, delta);
        temp1 = __lsx_vsub_h(q0, delta);
        DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
        DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
                  q_is_pcm_vec, temp0, temp1);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpackev_b(temp1, temp0);

        src += 1;
        __lsx_vstelm_h(temp0, src, 0, 0);
        __lsx_vstelm_h(temp0, src + stride, 0, 1);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
        src += stride_4x;
        __lsx_vstelm_h(temp0, src, 0, 4);
        __lsx_vstelm_h(temp0, src + stride, 0, 5);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
        src -= stride_4x;
    }
}

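/* SAO edge filtering in the 0-degree (horizontal) direction: each pixel
 * is compared against its left and right neighbours, the two sign tests
 * are folded into an index into edge_idx, and the looked-up offset is
 * applied with a saturating signed add (the XORs with 128 move the
 * unsigned samples into signed range and back). */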
static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
    __m128i const1 = __lsx_vldi(1);
    __m128i zero = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
        src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
        src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10,
                  diff_minus11);

        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);

        /* load in advance */
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
                  sao_offset, sao_offset, offset, offset, offset);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
    src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
    src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
              offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}

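/* Same 0-degree SAO kernel as above, processing two 8-pixel rows per
 * iteration. */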
hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst, int32_t dst_stride, uint8_t *src, int32_t src_stride, int16_t *sao_offset_val, int32_t height)1073 static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
1074                                                     int32_t dst_stride,
1075                                                     uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
                  src_minus11, shuf1, src0, src1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
                  src_minus11, shuf2, src_plus10, src_plus11);
        DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
                  src_plus10, src_minus10, src_plus10);
        src0 = __lsx_vpickev_d(src1, src0);

        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);

        /* load in advance */
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
              shuf1, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
              src_plus10, src_minus10, src_plus10);
    src0 = __lsx_vpickev_d(src1, src0);

    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
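
/*
 * All of the SAO edge filters below implement the same per-pixel rule.
 * As a scalar sketch (our own reference for readability, not code used by
 * the LSX paths; 'left'/'right' stand for the two neighbours along the
 * filter direction):
 *
 *     int sign(int x) { return (x > 0) - (x < 0); }
 *
 *     int idx = 2 + sign(c - left) + sign(c - right);   // 0 .. 4
 *     static const uint8_t remap[5] = {1, 2, 0, 3, 4};  // == edge_idx bytes
 *     dst = av_clip_uint8(c + sao_offset_val[remap[idx]]);
 *
 * The vector code computes sign(c - n) branchlessly: vseq_b plus vnor_v
 * yields -1 in every byte where c != n, vsle_bu plus vnor_v yields
 * all-ones where c > n, and vbitsel_v merges in +1 at those positions,
 * leaving {-1, 0, 1} per byte.  The final clip is done by flipping bytes
 * to signed with vxori_b(.., 128), adding the offset with the saturating
 * vsadd_b, and flipping back.
 */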

static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i sao_offset;
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    __m128i src_minus10, src_minus11, src_minus12, src_minus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_minus1 = src - 1;
        src_minus10 = __lsx_vld(src_minus1, 0);
        DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                  src_stride_2x, src_minus11, src_minus12);
        src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus1 += 16;
            dst_ptr = dst + v_cnt;
            src10 = __lsx_vld(src_minus1, 0);
            DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_minus1, src_stride_3x);
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
                      src_minus11, shuf1, src12, src_minus12, shuf1, src13,
                      src_minus13, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
                      src_minus11, shuf2, src12, src_minus12, shuf2, src13,
                      src_minus13, shuf2, src_plus10, src_plus11,
                      src_plus12, src_plus13);
            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus10 = src10;
            src_minus11 = src11;
            src_minus12 = src12;
            src_minus13 = src13;

            __lsx_vst(dst0, dst_ptr, 0);
            __lsx_vst(dst1, dst_ptr + dst_stride, 0);
            __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
            __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}
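
/*
 * Tiling note (descriptive only): each inner iteration covers a 16-column,
 * 4-row tile.  Every row is loaded once per tile, at column offset -1, so
 * shuf1 recovers the centre bytes (offset 0) and shuf2 the right-hand
 * neighbours (offset +1) from the pair {current tile's load, next tile's
 * load}; rolling src_minus1x = src1x then carries the loads across the
 * 16-byte boundary without any per-pixel unaligned reloads.
 */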

static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i src_zero0, src_zero1;
    __m128i offset;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    /* load in advance */
    DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
              src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
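
/*
 * Vertical (90 degree) variants: the neighbours are the rows above and
 * below.  vilvl_b interleaves {above, below} and duplicates the centre row
 * so that each sign pair lands in adjacent bytes; vhaddw_hu_bu then sums
 * each pair into a halfword, and since vpickev_b later keeps only the low
 * bytes, the unsigned byte-wise wrap-around of the {-1, 0, 1} sign values
 * is harmless.
 */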

static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
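
/*
 * A dispatch sketch (hedged: the actual wrapper lives outside these
 * kernels and may differ): a caller with an arbitrary width would be
 * expected to run the 16-multiple kernel on width & ~15 columns and mop
 * up the remaining 8- and 4-pixel tails with the narrower kernels above.
 */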

static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, dst2, src13, dst3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

        DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
                  src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                      src, src_stride_3x, src, src_stride_4x,
                      src10, src11, src12, src13);
            DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0,
                      offset_mask0, offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            src_minus10 = src12;
            DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
                      src12, 128, src_minus11, src10, src11, src12);
            DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
                      offset_mask1, src11, offset_mask2, src12,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);
            src_minus11 = src13;

            __lsx_vst(dst0, dst, 0);
            __lsx_vstx(dst1, dst, dst_stride);
            __lsx_vstx(dst2, dst, dst_stride_2x);
            __lsx_vstx(dst3, dst, dst_stride_3x);
            src += src_stride_4x;
            dst += dst_stride_4x;
        }
    }
}
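
/*
 * Here four rows are classified per iteration and the two rolling
 * registers are refilled from the bottom of the tile (src_minus10 = src12
 * before src12 is biased, src_minus11 = src13), so every source row is
 * loaded exactly once even though each row serves both as a centre row
 * and as the neighbour of the rows around it.
 */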

static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus11, src10, src11;
    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus0, src_plus1);

        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
                  src_zero1, src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus0, src_plus1);

    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
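
/*
 * In the "45 degree" variants the minus neighbour is the pixel one row up
 * and one column left, and the plus neighbour one row down and one column
 * right: the rows are loaded at src - 1, shuf1 recovers the centre pixels
 * (byte offset +1) of the current row, and shuf2 the +2 bytes of the row
 * below, i.e. its down-right diagonal.
 */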

static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
              src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus10, src_plus11);

        DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);

    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
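
/*
 * The loop above prefetches two rows ahead of the pair it filters, so by
 * the time it exits the registers already hold the last two centre rows
 * plus the row below the block that their down-right neighbours need; the
 * epilogue therefore runs entirely from registers, with no further loads.
 */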

static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13, src_minus14, src_plus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
    __m128i src_zero3, sao_offset, src_plus12;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_minus12, src_minus13);
        src_minus14 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 0);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);
            src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);

            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_minus12, shuf1, src12, src_minus13, shuf1,
                      src13, src_minus14, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
                      src_minus13, shuf2, src_plus10, src_plus11);
            src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10,
                      cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
                      128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}
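
/*
 * For the 16-wide diagonal tile the left context cannot be rolled as in
 * the 0 degree case, because the minus neighbours of the top row live one
 * row up: src_minus10 is therefore reloaded per tile from
 * src_orig - src_stride, and src_plus13 fetches the row below the tile at
 * column +1 to supply the bottom row's down-right neighbours.
 */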

static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
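
/*
 * The "135 degree" variants mirror the diagonal: the minus neighbour is
 * one row up and one column right (shuf2 applied to the row above) and the
 * plus neighbour one row down and one column left (the row below exactly
 * as loaded at src - 1).
 */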
2105 
hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst, int32_t dst_stride, uint8_t *src, int32_t src_stride, int16_t *sao_offset_val, int32_t height)2106 static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
2107                                                       int32_t dst_stride,
2108                                                       uint8_t *src,
2109                                                       int32_t src_stride,
2110                                                       int16_t *sao_offset_val,
2111                                                       int32_t height)
2112 {
2113     uint8_t *src_orig;
2114     const int32_t src_stride_2x = (src_stride << 1);
2115     const int32_t dst_stride_2x = (dst_stride << 1);
2116 
2117     __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2118     __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2119     __m128i edge_idx = {0x403000201, 0x0};
2120     __m128i const1 = __lsx_vldi(1);
2121     __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2122     __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2123     __m128i src_minus10, src10, src_minus11, src11;
2124     __m128i src_zero0, src_zero1, dst0;
2125     __m128i offset_mask0, offset_mask1;
2126     __m128i zeros = {0};
2127 
2128     sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2129     src_orig = src - 1;
2130 
2131     /* load in advance */
2132     DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2133               src_minus10, src_minus11);
2134     DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2135               src10, src11);
2136 
2137     for (height -= 2; height; height -= 2) {
2138         src_orig += src_stride_2x;
2139 
2140         DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2141                   shuf1, src_zero0, src_zero1);
2142         DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2143                   shuf2, src_minus10, src_minus11);
2144 
2145         DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2146                   src_minus10, src_minus11);
2147         DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2148                   src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

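        /* Adjacent bytes hold the signs for the two neighbours of one
         * pixel; the widening horizontal add sums them and +2 maps the
         * result to 0..4, which edge_idx translates to the edge-offset
         * class and sao_offset to the per-class correction. */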
        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
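
        /* Bias by 128 so the unsigned pixels can use the signed
         * saturating add, then undo the bias. */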
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

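    /* Tail: the last two rows, filtered exactly as in the loop but
     * without the look-ahead loads. */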
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          uint8_t *src,
                                                          int32_t src_stride,
                                                          int16_t *sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    uint8_t *src_orig, *dst_orig;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0, dst1, dst2, dst3;
    __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
    __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

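    /* Four rows per outer iteration, swept left to right in 16-pixel
     * strips; each strip reuses the previous strip's loads as its left
     * context. */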
    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;

        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_plus10, src_plus11);
        src_plus12 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 2);
            src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                      src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);

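            /* Rebuild the centre pixels of each line with shuf1 and the
             * one-column-right pixels with shuf2, pairing every pixel
             * with its up-right ("minus") and down-left ("plus")
             * neighbours. */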
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_plus10, shuf1, src12, src_plus11, shuf1, src13,
                      src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
                      src_plus11, shuf2, src_minus12, src_minus13);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

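            /* Same classification as the 8-width path, but on whole
             * 16-byte rows: sign(minus) + sign(plus) + 2 indexes
             * edge_idx, and the resulting class indexes the packed
             * offsets. */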
            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

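            /* Bias by 128 so the unsigned pixels can use the signed
             * saturating add, then undo the bias. */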
            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }

        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}

void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
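    /* The source is the CTB copy FFmpeg makes before SAO runs; its rows
     * are laid out with this fixed stride, so only dst's stride is
     * passed in. */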
    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);

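    /* eo selects the comparison direction (0: horizontal, 1: vertical,
     * 2: 45 degree, 3: 135 degree, following this file's helper naming).
     * Each case consumes the width in 16-pixel multiples first, then an
     * 8-pixel chunk, then any 4-pixel remainder. */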
    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width & 0x0F),
                                                        height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width & 0x0F),
                                                         height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width & 0x0F),
                                                         height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width & 0x0F),
                                                          height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}