/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

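/* Deblock one horizontal luma edge of 8 pixels (two independent 4-pixel
 * segments), 8-bit samples.  'src' points at the q0 row just below the
 * edge; p0..p3 are the rows above it.  beta/tc are the HEVC thresholds and
 * p_is_pcm/q_is_pcm flag blocks that must not be modified, one flag per
 * 4-pixel segment. */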
void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src - stride_4x;
    uint8_t *p2 = src - stride_3x;
    uint8_t *p1 = src - stride_2x;
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + stride_2x;
    uint8_t *q3 = src + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;

    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp0, temp1;
    __m128i temp2, tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

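    /* Per-segment boundary-decision variables from the HEVC spec:
     * dpX = |p2 - 2*p1 + p0| and dqX = |q2 - 2*q1 + q0| measured at
     * columns 0 and 3 of each 4-pixel segment; dX = dpX + dqX. */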
    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;

    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;
    DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_w(cmp1, cmp0);
    cmp3 = __lsx_vseqi_w(cmp3, 0);

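    /* Filter only if at least one side is not PCM and at least one
     * 4-pixel segment passes the d < beta activity test; otherwise the
     * whole edge is left untouched. */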
    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
                  p3_src, p2_src, p1_src, p0_src);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);

        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
                  p0_src, p3_src, p2_src, p1_src, p0_src);
        DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
                  q0_src, q1_src, q2_src, q3_src);
        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                abs(p0[0] - q0[0]) < tc250;
        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
                  zero, q3_src, q0_src, q1_src, q2_src, q3_src);

        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                abs(p0[4] - q0[4]) < tc254;
        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
        cmp2 = __lsx_vpackev_w(cmp1, cmp0);
        cmp2 = __lsx_vseqi_w(cmp2, 0);

        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            /* p part */
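            /* HEVC strong filter, p side:
             *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
             *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
             *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
             * each clipped to +/- 2*tc around the original sample. */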
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp0 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);

            /* pack results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);

            /* pack src to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);

            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
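            /* HEVC weak filter: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4,
             * applied only where |delta0| < 10*tc, then clipped to +/- tc. */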
            tc_neg = __lsx_vneg_h(tc_pos);
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src,
                                    __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
                                    q_is_pcm_vec));
            DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                      q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);

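            /* Extend the weak filter to p1/q1 only where the per-side
             * activity is low enough: dpX sums below (beta + beta/2) >> 3. */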
            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);

            DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
                      tc_neg, tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst1, dst2, dst3, dst4);
            /* pack results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
            /* pack src to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
                      dst2, dst3);
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
                      dst0, dst1);

            p2 += stride;
            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            /* weak filter ends */
        } else { /* strong + weak */
            /* strong filter */
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);

            /* pack strong results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);
            /* strong filter ends */

            /* weak filter */
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);

            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
            DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
                      cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));

            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
            /* weak filter ends */

            /* pack weak results to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, delta2);

            /* select between weak or strong */
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);

            /* pack src to 8 bit */
            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);

            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
        }
    }
}

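/* Deblock one vertical luma edge of 8 pixels (two 4-pixel segments),
 * 8-bit samples.  'src' points at the first row of the edge, with the
 * edge itself between src[-1] and src[0]; the 8x8 neighbourhood is
 * transposed so the same arithmetic as the horizontal case applies. */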
void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
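    /* Despite their names, p3..p0 below are row pointers: rows 0, 3, 4 and 7
     * of the 8-row block, i.e. the rows used for the boundary decisions of
     * the two 4-pixel segments. */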
    uint8_t *p3 = src;
    uint8_t *p2 = src + stride_3x;
    uint8_t *p1 = src + stride_4x;
    uint8_t *p0 = src + stride_4x + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;

    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    __m128i cmp3;
    __m128i temp0, temp1;
    __m128i temp2;
    __m128i tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_d(cmp1, cmp0);
    cmp3 = __lsx_vseqi_d(cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        src -= 4;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
        src += stride_4x;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
        src -= stride_4x;

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                           q0_src, q1_src, q2_src, q3_src);
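        /* After the 8x8 byte transpose each vector holds one column of the
         * neighbourhood, so the remaining arithmetic is identical to the
         * horizontal edge filter above. */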

        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                abs(p3[-1] - p3[0]) < tc250;
        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        cmp0 = __lsx_vreplgr2vr_d(flag0);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
                  p0_src, p3_src, p2_src, p1_src, p0_src);

        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                abs(p1[-1] - p1[0]) < tc254;
        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
                  q3_src, q0_src, q1_src, q2_src, q3_src);

        cmp1 = __lsx_vreplgr2vr_d(flag1);
        cmp2 = __lsx_vpackev_d(cmp1, cmp0);
        cmp2 = __lsx_vseqi_d(cmp2, 0);

        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_neg = __lsx_vneg_h(tc_pos);
            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);

            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);

            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst0, dst1, dst2, dst3);
            /* weak filter ends */

            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
                      cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);

            /* transpose */
            dst4 = __lsx_vilvl_b(dst1, dst0);
            dst5 = __lsx_vilvh_b(dst1, dst0);
            dst0 = __lsx_vilvl_h(dst5, dst4);
            dst1 = __lsx_vilvh_h(dst5, dst4);

            src += 2;
            __lsx_vstelm_w(dst0, src, 0, 0);
            __lsx_vstelm_w(dst0, src + stride, 0, 1);
            __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
            src += stride_4x;
            __lsx_vstelm_w(dst1, src, 0, 0);
            __lsx_vstelm_w(dst1, src + stride, 0, 1);
            __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
            return;
        } else { /* strong + weak */
            /* strong filter */
            tc_neg = __lsx_vneg_h(tc_pos);

            /* p part */
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);

            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);

            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);

            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);

            /* q part */
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);

            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);

            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);

            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
            /* strong filter ends */

            /* weak filter */
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);

            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);

            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);

            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
            /* weak filter ends */

            /* select between weak or strong */
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
                      cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
                      dst4, dst5);
        }

        cmp3 = __lsx_vnor_v(cmp3, cmp3);
        DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
                  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
                  dst4, dst5);

        /* pack results to 8 bit */
        DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
                  dst5, dst0, dst1, dst2, dst3);

        /* transpose */
        DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
        DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
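        /* Write back six modified pixels (p2..q2) per row: a 32-bit word
         * (p2..q0) followed by a 16-bit halfword (q1, q2). */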

        src += 1;
        __lsx_vstelm_w(dst0, src, 0, 0);
        __lsx_vstelm_h(dst2, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 1);
        __lsx_vstelm_h(dst2, src, 4, 2);
        src += stride;

        __lsx_vstelm_w(dst0, src, 0, 2);
        __lsx_vstelm_h(dst2, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 3);
        __lsx_vstelm_h(dst2, src, 4, 6);
        src += stride;

        __lsx_vstelm_w(dst1, src, 0, 0);
        __lsx_vstelm_h(dst3, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 1);
        __lsx_vstelm_h(dst3, src, 4, 2);
        src += stride;

        __lsx_vstelm_w(dst1, src, 0, 2);
        __lsx_vstelm_h(dst3, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 3);
        __lsx_vstelm_h(dst3, src, 4, 6);
    }
}

void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                        int32_t *tc, uint8_t *p_is_pcm,
                                        uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;

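    /* Chroma deblocking uses a single weak filter:
     *   delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3))
     * applied to p0/q0 only, and only for segments with tc > 0 that are
     * not PCM. */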
    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);
        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
                  p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);
        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
        temp0 = __lsx_vadd_h(p0, delta);
        temp0 = __lsx_vclip255_h(temp0);
        p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
        temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);

        temp1 = __lsx_vsub_h(q0, delta);
        temp1 = __lsx_vclip255_h(temp1);
        q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
        temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpickev_b(temp1, temp0);
        __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
        __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
    }
}

void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
                                        int32_t *tc, uint8_t *p_is_pcm,
                                        uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;

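    /* Vertical chroma edge: gather two pixels on each side of the edge from
     * eight rows, transpose to form the p1/p0/q0/q1 vectors, filter, then
     * scatter the two filtered bytes back per row. */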
    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);

        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        src -= 2;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, src0, src1, src2, src3);
        src += stride_4x;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, src4, src5, src6, src7);
        src -= stride_4x;
        LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);

        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);

        temp0 = __lsx_vadd_h(p0, delta);
        temp1 = __lsx_vsub_h(q0, delta);
        DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
        DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
                  q_is_pcm_vec, temp0, temp1);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpackev_b(temp1, temp0);

        src += 1;
        __lsx_vstelm_h(temp0, src, 0, 0);
        __lsx_vstelm_h(temp0, src + stride, 0, 1);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
        src += stride_4x;
        __lsx_vstelm_h(temp0, src, 0, 4);
        __lsx_vstelm_h(temp0, src + stride, 0, 5);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
        src -= stride_4x;
    }
}

static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
    __m128i const1 = __lsx_vldi(1);
    __m128i zero = {0};
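
    /* SAO edge offset, class 0 (horizontal neighbours), 4 pixels wide.
     * For each pixel c with left/right neighbours l and r, the edge index
     * is 2 + sign(c - l) + sign(c - r), remapped through edge_idx, and the
     * corresponding sao_offset entry is added with saturation.  Two rows
     * are processed per iteration. */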

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
        src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
        src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);

        /* load in advance */
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
                  sao_offset, sao_offset, offset, offset, offset);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
    src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
    src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
              offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}

static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
                  src_minus11, shuf1, src0, src1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
                  src_minus11, shuf2, src_plus10, src_plus11);
        DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
                  src_plus10, src_minus10, src_plus10);
        src0 = __lsx_vpickev_d(src1, src0);

        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);

        /* load in advance */
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
              shuf1, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
              src_plus10, src_minus10, src_plus10);
    src0 = __lsx_vpickev_d(src1, src0);

    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i sao_offset;
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    __m128i src_minus10, src_minus11, src_minus12, src_minus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1200
1201    for (; height; height -= 4) {
1202        src_minus1 = src - 1;
1203        src_minus10 = __lsx_vld(src_minus1, 0);
1204        DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1205                  src_stride_2x, src_minus11, src_minus12);
1206        src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
1207
1208        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1209            src_minus1 += 16;
1210            dst_ptr = dst + v_cnt;
1211            src10 = __lsx_vld(src_minus1, 0);
1212            DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1213                      src_stride_2x, src11, src12);
1214            src13 = __lsx_vldx(src_minus1, src_stride_3x);
1215            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
1216                      src_minus11, shuf1, src12, src_minus12, shuf1, src13,
1217                      src_minus13, shuf1, src_zero0, src_zero1,
1218                      src_zero2, src_zero3);
1219            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
1220                      src_minus11, shuf2, src12, src_minus12, shuf2, src13,
1221                      src_minus13, shuf2, src_plus10, src_plus11,
1222                      src_plus12, src_plus13);
1223            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1224                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1225                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1226            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1227                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1228                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1229            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1230                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1231                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1232                      diff_plus11);
1233            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1234                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1235                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1236                      diff_plus13);
1237            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1238                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1239                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1240            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1241                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1242                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1243            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1244                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1245                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1246                      cmp_plus11);
1247            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1248                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1249                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1250                      cmp_plus13);
1251            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1252                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1253                      cmp_minus11, diff_plus11, const1, cmp_plus11,
1254                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1255            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1256                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1257                      cmp_minus13, diff_plus13, const1, cmp_plus13,
1258                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1259
1260            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1261                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1262                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1263                      offset_mask3);
1264            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1265                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
1266                      offset_mask1, offset_mask2, offset_mask3);
1267            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1268                      sao_offset, sao_offset, offset_mask0, offset_mask0,
1269                      offset_mask0);
1270            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1271                      sao_offset, sao_offset, offset_mask1, offset_mask1,
1272                      offset_mask1);
1273            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1274                      sao_offset, sao_offset, offset_mask2, offset_mask2,
1275                      offset_mask2);
1276            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1277                      sao_offset, sao_offset, offset_mask3, offset_mask3,
1278                      offset_mask3);
1279
1280            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
1281                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
1282                      src_zero2, src_zero3);
1283            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1284                      offset_mask1, src_zero2, offset_mask2, src_zero3,
1285                      offset_mask3, dst0, dst1, dst2, dst3);
1286            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1287                      128, dst0, dst1, dst2, dst3);
1288
1289            src_minus10 = src10;
1290            src_minus11 = src11;
1291            src_minus12 = src12;
1292            src_minus13 = src13;
1293
1294            __lsx_vst(dst0, dst_ptr, 0);
1295            __lsx_vst(dst1, dst_ptr + dst_stride, 0);
1296            __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
1297            __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
1298        }
1299        src += src_stride_4x;
1300        dst += dst_stride_4x;
1301    }
1302}
1303
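/* HEVC SAO edge-offset filter, 90 degree (vertical) direction, 4-pixel-wide
 * blocks; two rows are filtered per iteration. */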
static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i src_zero0, src_zero1;
    __m128i offset;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    /* load in advance */
    DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
              src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

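    /* process the final two rows, using the rows loaded in advance above */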
    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}

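/* HEVC SAO edge-offset filter, 90 degree (vertical) direction, 8-pixel-wide
 * blocks; two rows are filtered per iteration. */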
static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

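    /* process the final two rows, using the rows loaded in advance above */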
    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

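/* HEVC SAO edge-offset filter, 90 degree (vertical) direction, for block
 * widths that are a multiple of 16; each 16-pixel column is filtered four
 * rows at a time. */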
static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, dst2, src13, dst3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

        DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
                  src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                      src, src_stride_3x, src, src_stride_4x,
                      src10, src11, src12, src13);
            DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0,
                      offset_mask0, offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            src_minus10 = src12;
            DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
                      src12, 128, src_minus11, src10, src11, src12);
            DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
                      offset_mask1, src11, offset_mask2, src12,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);
            src_minus11 = src13;

            __lsx_vst(dst0, dst, 0);
            __lsx_vstx(dst1, dst, dst_stride);
            __lsx_vstx(dst2, dst, dst_stride_2x);
            __lsx_vstx(dst3, dst, dst_stride_3x);
            src += src_stride_4x;
            dst += dst_stride_4x;
        }
    }
}

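/* HEVC SAO edge-offset filter, 45 degree diagonal direction, 4-pixel-wide
 * blocks; two rows are filtered per iteration. */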
static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus11, src10, src11;
    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus0, src_plus1);

        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
                  src_zero1, src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
             diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

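    /* process the final two rows, using the rows loaded in advance above */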
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus0, src_plus1);

    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}

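/* HEVC SAO edge-offset filter, 45 degree diagonal direction, 8-pixel-wide
 * blocks; two rows are filtered per iteration. */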
static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
              src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus10, src_plus11);

        DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
               diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

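    /* process the final two rows, using the rows loaded in advance above */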
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);

    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    /* load in advance */
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

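/* HEVC SAO edge-offset filter, 45 degree diagonal direction, for block widths
 * that are a multiple of 16; four rows of 16 pixels per pass. */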
static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13, src_minus14, src_plus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
    __m128i src_zero3, sao_offset, src_plus12;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_minus12, src_minus13);
        src_minus14 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 0);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);
            src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);

            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_minus12, shuf1, src12, src_minus13, shuf1,
                      src13, src_minus14, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
                      src_minus13, shuf2, src_plus10, src_plus11);
            src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10,
                      cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
                      128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}

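/* HEVC SAO edge-offset filter, 135 degree diagonal direction, 4-pixel-wide
 * blocks; two rows are filtered per iteration. */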
static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
               diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

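    /* process the final two rows, using the rows loaded in advance above */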
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
    dst += dst_stride_2x;
}

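/* HEVC SAO edge-offset filter, 135 degree diagonal direction, 8-pixel-wide
 * blocks; two rows are filtered per iteration. */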
static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
              diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

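    /* process the final two rows, using the rows loaded in advance above */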
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

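/* HEVC SAO edge-offset filter, 135 degree diagonal direction, for block
 * widths that are a multiple of 16; four rows of 16 pixels per pass. */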
2220static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
2221                                                          int32_t dst_stride,
2222                                                          uint8_t *src,
2223                                                          int32_t src_stride,
2224                                                          int16_t *sao_offset_val,
2225                                                          int32_t width,
2226                                                          int32_t height)
2227{
2228    uint8_t *src_orig, *dst_orig;
2229    int32_t v_cnt;
2230    const int32_t src_stride_2x = (src_stride << 1);
2231    const int32_t dst_stride_2x = (dst_stride << 1);
2232    const int32_t src_stride_4x = (src_stride << 2);
2233    const int32_t dst_stride_4x = (dst_stride << 2);
2234    const int32_t src_stride_3x = src_stride_2x + src_stride;
2235    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
2236
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0, dst1, dst2, dst3;
    __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
    __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

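    /* Narrow the 16-bit SAO offsets to bytes so they can be looked up and
     * added with byte operations. */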
    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

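    /* Process four rows per outer iteration and 16 columns per inner
     * iteration. */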
    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;

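        /* Load the first four rows of this stripe, starting one byte to
         * the left of src. */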
        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_plus10, src_plus11);
        src_plus12 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 2);
            src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                      src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);

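            /* Align the centre samples of rows 0..3 and the shifted rows
             * used as their diagonal neighbours. */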
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_plus10,  shuf1, src12, src_plus11, shuf1, src13,
                      src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
                      src_plus11, shuf2, src_minus12, src_minus13);

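            /* Per-byte sign of (centre - neighbour): equality gives 0,
             * otherwise -1, then the unsigned-compare mask selects +1
             * where the centre sample is larger. */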
            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10,  src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

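            /* offset_mask = 2 + sign(c - neighbour0) + sign(c - neighbour1):
             * the raw edge index per sample. */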
            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

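            /* Map the edge index through edge_idx, then replace it with the
             * matching SAO offset byte. */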
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

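            /* Apply the offsets with signed saturation: bias the samples to
             * the signed range, add, then bias back. */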
            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

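            /* Carry the rows just loaded over to the next 16-column block. */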
            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }

        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}

void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);

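    /* eo selects the edge-offset class (0: 0 degree, 1: 90 degree,
     * 2: 45 degree, 3: 135 degree). Each case covers the width as a
     * multiple of 16 columns first, then an 8-column chunk, then a
     * 4-column remainder. */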
    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width & 0x0F),
                                                        height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width & 0x0F),
                                                         height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width & 0x0F),
                                                         height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width & 0x0F),
                                                          height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}