1 /*
2 * Copyright (c) 2022 Loongson Technology Corporation Limited
3 * Contributed by Lu Wang <wanglu@loongson.cn>
4 * Hao Chen <chenhao@loongson.cn>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "hevcdsp_lsx.h"
25
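/*
 * Deblock one 8-pixel-long horizontal luma edge (filtering across rows).
 * The edge is handled as two independent 4-pixel halves: tc[0]/tc[1] and
 * p_is_pcm[0..1]/q_is_pcm[0..1] give the per-half parameters, and the
 * scalar on/off, strong/weak and PCM decisions below are folded back into
 * the SIMD results through select masks.
 */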
26 void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
27 int32_t beta, int32_t *tc,
28 uint8_t *p_is_pcm, uint8_t *q_is_pcm)
29 {
30 ptrdiff_t stride_2x = (stride << 1);
31 ptrdiff_t stride_4x = (stride << 2);
32 ptrdiff_t stride_3x = stride_2x + stride;
33 uint8_t *p3 = src - stride_4x;
34 uint8_t *p2 = src - stride_3x;
35 uint8_t *p1 = src - stride_2x;
36 uint8_t *p0 = src - stride;
37 uint8_t *q0 = src;
38 uint8_t *q1 = src + stride;
39 uint8_t *q2 = src + stride_2x;
40 uint8_t *q3 = src + stride_3x;
41 uint8_t flag0, flag1;
42 int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
43 int32_t dp04, dq04, dp34, dq34, d04, d34;
44 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
45 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
46
47 __m128i dst0, dst1, dst2, dst3, dst4, dst5;
48 __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
49 __m128i temp0, temp1;
50 __m128i temp2, tc_pos, tc_neg;
51 __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
52 __m128i zero = {0};
53 __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
54
55 dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
56 dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
57 dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
58 dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
59 d00 = dp00 + dq00;
60 d30 = dp30 + dq30;
61 dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
62 dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
63 dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
64 dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
65 d04 = dp04 + dq04;
66 d34 = dp34 + dq34;
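    /* Second-derivative activity per the HEVC deblocking design:
     * dpX = |p2 - 2*p1 + p0| and dqX = |q2 - 2*q1 + q0|, sampled at
     * columns 0 and 3 of the first half and columns 4 and 7 of the second. */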
67
68 p_is_pcm0 = p_is_pcm[0];
69 p_is_pcm4 = p_is_pcm[1];
70 q_is_pcm0 = q_is_pcm[0];
71 q_is_pcm4 = q_is_pcm[1];
72
73 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
74 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
75 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
76 d0030 = (d00 + d30) >= beta;
77 d0434 = (d04 + d34) >= beta;
78 DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
79 cmp3 = __lsx_vpackev_w(cmp1, cmp0);
80 cmp3 = __lsx_vseqi_w(cmp3, 0);
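    /* d0030/d0434: per-half "activity too high, skip filtering" flags
     * ((d00 + d30) >= beta etc.).  cmp3 is all-ones for a half that does get
     * filtered; it is inverted before the final stores so the other half
     * keeps its source pixels. */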
81
82 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
83 (!d0030 || !d0434)) {
84 DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
85 p3_src, p2_src, p1_src, p0_src);
86 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
87 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
88 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
89
90 tc0 = tc[0];
91 beta30 = beta >> 3;
92 beta20 = beta >> 2;
93 tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
94 tc4 = tc[1];
95 tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
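        /* Decision thresholds: beta30 = beta >> 3, beta20 = beta >> 2 and
         * tc25 = (5 * tc + 1) >> 1, evaluated separately for each 4-pixel
         * half (tc0/tc250 for columns 0..3, tc4/tc254 for columns 4..7). */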
96
97 DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
98 DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
99 p0_src, p3_src, p2_src, p1_src, p0_src);
100 DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
101 q0_src, q1_src, q2_src, q3_src);
102 flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
103 abs(p0[0] - q0[0]) < tc250;
104 flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
105 abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
106 (d30 << 1) < beta20);
107 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
108 DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
109 zero, q3_src, q0_src, q1_src, q2_src, q3_src);
110
111 flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
112 abs(p0[4] - q0[4]) < tc254;
113 flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
114 abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
115 (d34 << 1) < beta20);
116 DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
117 cmp2 = __lsx_vpackev_w(cmp1, cmp0);
118 cmp2 = __lsx_vseqi_w(cmp2, 0);
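        /* flag0/flag1: strong-filter decision for the first/second half.
         * cmp2 is all-ones where the weak path must be used; it is only
         * consumed in the mixed strong+weak branch below. */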
119
120 if (flag0 && flag1) { /* strong only */
121 /* strong filter */
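            /* Strong filter, clipped to +/-2*tc around the input samples:
             *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
             *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
             *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
             * and symmetrically for q0'/q1'/q2'.  temp0 = p1 + p0 + q0 is
             * the partial sum shared by all three taps. */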
122 tc_pos = __lsx_vslli_h(tc_pos, 1);
123 tc_neg = __lsx_vneg_h(tc_pos);
124
125 /* p part */
126 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
127 temp0, temp0);
128 temp1 = __lsx_vadd_h(p3_src, p2_src);
129 temp1 = __lsx_vslli_h(temp1, 1);
130 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
131 temp1 = __lsx_vsrari_h(temp1, 3);
132 temp2 = __lsx_vsub_h(temp1, p2_src);
133 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
134 dst0 = __lsx_vadd_h(temp2, p2_src);
135
136 temp1 = __lsx_vadd_h(temp0, p2_src);
137 temp1 = __lsx_vsrari_h(temp1, 2);
138 temp2 = __lsx_vsub_h(temp1, p1_src);
139 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
140 dst1 = __lsx_vadd_h(temp2, p1_src);
141
142 temp1 = __lsx_vslli_h(temp0, 1);
143 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
144 temp1, temp1);
145 temp1 = __lsx_vsrari_h(temp1, 3);
146 temp2 = __lsx_vsub_h(temp1, p0_src);
147 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
148 dst2 = __lsx_vadd_h(temp2, p0_src);
149
150 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
151 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
152 p1_src, p_is_pcm_vec, dst0, dst1);
153 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
154
155 /* q part */
156 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
157 temp0, temp0);
158 temp1 = __lsx_vadd_h(q3_src, q2_src);
159 temp1 = __lsx_vslli_h(temp1, 1);
160 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
161 temp1 = __lsx_vsrari_h(temp1, 3);
162 temp2 = __lsx_vsub_h(temp1, q2_src);
163 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
164 dst5 = __lsx_vadd_h(temp2, q2_src);
165
166 temp1 = __lsx_vadd_h(temp0, q2_src);
167 temp1 = __lsx_vsrari_h(temp1, 2);
168 temp2 = __lsx_vsub_h(temp1, q1_src);
169 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
170 dst4 = __lsx_vadd_h(temp2, q1_src);
171
172 temp0 = __lsx_vslli_h(temp0, 1);
173 DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
174 temp1, temp1);
175 temp1 = __lsx_vsrari_h(temp1, 3);
176 temp2 = __lsx_vsub_h(temp1, q0_src);
177 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
178 dst3 = __lsx_vadd_h(temp2, q0_src);
179
180 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
181 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
182 q1_src, q_is_pcm_vec, dst3, dst4);
183 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
184
185 /* pack results to 8 bit */
186 DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
187 dst2 = __lsx_vpickev_b(dst5, dst4);
188
189 /* pack src to 8 bit */
190 DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
191 dst3, dst4);
192 dst5 = __lsx_vpickev_b(q2_src, q1_src);
193
194 cmp3 = __lsx_vnor_v(cmp3, cmp3);
195 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
196 dst0, dst1);
197 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
198
199 __lsx_vstelm_d(dst0, p2, 0, 0);
200 __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
201 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
202 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
203 __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
204 __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
205 /* strong filter ends */
206 } else if (flag0 == flag1) { /* weak only */
207 /* weak filter */
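            /* Weak filter: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4 is
             * applied to p0/q0 only where |delta0| < 10*tc, clipped to +/-tc.
             * p1/q1 receive a secondary correction clipped to +/-(tc >> 1)
             * when the side activity dp/dq is below (beta + (beta >> 1)) >> 3. */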
208 tc_neg = __lsx_vneg_h(tc_pos);
209 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
210 diff0, diff1);
211 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
212 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
213 delta0 = __lsx_vsub_h(diff0, diff1);
214 delta0 = __lsx_vsrari_h(delta0, 4);
215 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
216 __lsx_vslli_h(tc_pos, 1));
217 abs_delta0 = __lsx_vadda_h(delta0, zero);
218 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
219 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
220
221 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
222 temp2 = __lsx_vadd_h(delta0, p0_src);
223 temp2 = __lsx_vclip255_h(temp2);
224 temp0 = __lsx_vbitsel_v(temp2, p0_src,
225 __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
226 temp2 = __lsx_vsub_h(q0_src, delta0);
227 temp2 = __lsx_vclip255_h(temp2);
228 temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
229 q_is_pcm_vec));
230 DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
231 q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
232
233 tmp = (beta + (beta >> 1)) >> 3;
234 DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
235 cmp0, cmp1);
236 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
237 cmp0 = __lsx_vseqi_d(cmp0, 0);
238 p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
239
240 DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
241 cmp0, cmp1);
242 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
243 cmp0 = __lsx_vseqi_d(cmp0, 0);
244 q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
245 tc_pos = __lsx_vsrai_h(tc_pos, 1);
246 tc_neg = __lsx_vneg_h(tc_pos);
247
248 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
249 delta1, delta2);
250 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
251 delta1, delta2);
252 delta1 = __lsx_vadd_h(delta1, delta0);
253 delta2 = __lsx_vsub_h(delta2, delta0);
254 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
255 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
256 tc_neg, tc_pos, delta1, delta2);
257 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
258 delta1, delta2);
259 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
260 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
261 q1_src, q_is_pcm_vec, delta1, delta2);
262
263 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
264 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
265 p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
266 q1_src, abs_delta0, dst1, dst2, dst3, dst4);
267 /* pack results to 8 bit */
268 DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
269 /* pack src to 8 bit */
270 DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
271 dst2, dst3);
272 cmp3 = __lsx_vnor_v(cmp3, cmp3);
273 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
274 dst0, dst1);
275
276 p2 += stride;
277 __lsx_vstelm_d(dst0, p2, 0, 0);
278 __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
279 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
280 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
281 /* weak filter ends */
282 } else { /* strong + weak */
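            /* Mixed case: one half takes the strong filter, the other the
             * weak one.  Both filters are evaluated on all eight columns and
             * the per-half results are blended through cmp2 before storing. */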
283 /* strong filter */
284 tc_pos = __lsx_vslli_h(tc_pos, 1);
285 tc_neg = __lsx_vneg_h(tc_pos);
286
287 /* p part */
288 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
289 temp0, temp0);
290 temp1 = __lsx_vadd_h(p3_src, p2_src);
291 temp1 = __lsx_vslli_h(temp1, 1);
292 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
293 temp1 = __lsx_vsrari_h(temp1, 3);
294 temp2 = __lsx_vsub_h(temp1, p2_src);
295 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
296 dst0 = __lsx_vadd_h(temp2, p2_src);
297
298 temp1 = __lsx_vadd_h(temp0, p2_src);
299 temp1 = __lsx_vsrari_h(temp1, 2);
300 temp2 = __lsx_vsub_h(temp1, p1_src);
301 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
302 dst1 = __lsx_vadd_h(temp2, p1_src);
303
304 temp1 = __lsx_vslli_h(temp0, 1);
305 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
306 temp1 = __lsx_vsrari_h(temp1, 3);
307 temp2 = __lsx_vsub_h(temp1, p0_src);
308 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
309 dst2 = __lsx_vadd_h(temp2, p0_src);
310
311 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
312 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
313 p1_src, p_is_pcm_vec, dst0, dst1);
314 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
315
316 /* q part */
317 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
318 temp0, temp0);
319 temp1 = __lsx_vadd_h(q3_src, q2_src);
320 temp1 = __lsx_vslli_h(temp1, 1);
321 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
322 temp1 = __lsx_vsrari_h(temp1, 3);
323 temp2 = __lsx_vsub_h(temp1, q2_src);
324 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
325 dst5 = __lsx_vadd_h(temp2, q2_src);
326
327 temp1 = __lsx_vadd_h(temp0, q2_src);
328 temp1 = __lsx_vsrari_h(temp1, 2);
329 temp2 = __lsx_vsub_h(temp1, q1_src);
330 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
331 dst4 = __lsx_vadd_h(temp2, q1_src);
332
333 temp1 = __lsx_vslli_h(temp0, 1);
334 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
335 temp1 = __lsx_vsrari_h(temp1, 3);
336 temp2 = __lsx_vsub_h(temp1, q0_src);
337 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
338 dst3 = __lsx_vadd_h(temp2, q0_src);
339
340 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
341 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
342 q1_src, q_is_pcm_vec, dst3, dst4);
343 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
344
345 /* pack strong results to 8 bit */
346 DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
347 dst2 = __lsx_vpickev_b(dst5, dst4);
348 /* strong filter ends */
349
350 /* weak filter */
351 tc_pos = __lsx_vsrai_h(tc_pos, 1);
352 tc_neg = __lsx_vneg_h(tc_pos);
353
354 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
355 diff0, diff1);
356 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
357 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
358 delta0 = __lsx_vsub_h(diff0, diff1);
359 delta0 = __lsx_vsrari_h(delta0, 4);
360 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
361 __lsx_vslli_h(tc_pos, 1));
362 abs_delta0 = __lsx_vadda_h(delta0, zero);
363 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
364 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
365
366 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
367 temp2 = __lsx_vadd_h(delta0, p0_src);
368 temp2 = __lsx_vclip255_h(temp2);
369 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
370
371 temp2 = __lsx_vsub_h(q0_src, delta0);
372 temp2 = __lsx_vclip255_h(temp2);
373 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
374
375 tmp = (beta + (beta >> 1)) >> 3;
376 DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
377 cmp0, cmp1);
378 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
379 p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
380 DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
381 cmp0, cmp1);
382 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
383 q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
384
385 tc_pos = __lsx_vsrai_h(tc_pos, 1);
386 tc_neg = __lsx_vneg_h(tc_pos);
387
388 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
389 delta1, delta2);
390 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
391 delta1, delta2);
392 delta1 = __lsx_vadd_h(delta1, delta0);
393 delta2 = __lsx_vsub_h(delta2, delta0);
394 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
395 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
396 tc_pos, delta1, delta2);
397 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
398 delta1, delta2);
399 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
400 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
401 q1_src, q_is_pcm_vec, delta1, delta2);
402 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
403 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
404 q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
405 q0_src, abs_delta0, delta1, delta2, temp0, temp2);
406 /* weak filter ends */
407
408 /* pack weak results to 8 bit */
409 DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
410 dst3, dst4);
411 dst5 = __lsx_vpickev_b(q2_src, delta2);
412
413 /* select between weak or strong */
414 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
415 dst0, dst1);
416 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
417
418 /* pack src to 8 bit */
419 DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
420 dst3, dst4);
421 dst5 = __lsx_vpickev_b(q2_src, q1_src);
422
423 cmp3 = __lsx_vnor_v(cmp3, cmp3);
424 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
425 dst0, dst1);
426 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
427
428 __lsx_vstelm_d(dst0, p2, 0, 0);
429 __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
430 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
431 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
432 __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
433 __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
434 }
435 }
436 }
437
438 void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
439 int32_t beta, int32_t *tc,
440 uint8_t *p_is_pcm, uint8_t *q_is_pcm)
441 {
442 ptrdiff_t stride_2x = (stride << 1);
443 ptrdiff_t stride_4x = (stride << 2);
444 ptrdiff_t stride_3x = stride_2x + stride;
445 uint8_t *p3 = src;
446 uint8_t *p2 = src + stride_3x;
447 uint8_t *p1 = src + stride_4x;
448 uint8_t *p0 = src + stride_4x + stride_3x;
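    /* Vertical edge: p3/p2/p1/p0 here are row pointers (rows 0, 3, 4 and 7
     * of the 8-row segment) used only for the scalar decision reads; the
     * pixel columns around the edge are reached with negative and positive
     * offsets from them. */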
449 uint8_t flag0, flag1;
450 int32_t dp00, dq00, dp30, dq30, d00, d30;
451 int32_t d0030, d0434;
452 int32_t dp04, dq04, dp34, dq34, d04, d34;
453 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
454 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
455
456 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
457 __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
458 __m128i cmp3;
459 __m128i temp0, temp1;
460 __m128i temp2;
461 __m128i tc_pos, tc_neg;
462 __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
463 __m128i zero = {0};
464 __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
465
466 dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
467 dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
468 dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
469 dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
470 d00 = dp00 + dq00;
471 d30 = dp30 + dq30;
472 p_is_pcm0 = p_is_pcm[0];
473 q_is_pcm0 = q_is_pcm[0];
474
475 dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
476 dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
477 dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
478 dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
479 d04 = dp04 + dq04;
480 d34 = dp34 + dq34;
481 p_is_pcm4 = p_is_pcm[1];
482 q_is_pcm4 = q_is_pcm[1];
483
484 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
485 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
486 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
487
488 d0030 = (d00 + d30) >= beta;
489 d0434 = (d04 + d34) >= beta;
490
491 DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
492 cmp3 = __lsx_vpackev_d(cmp1, cmp0);
493 cmp3 = __lsx_vseqi_d(cmp3, 0);
494
495 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
496 (!d0030 || !d0434)) {
497 src -= 4;
498 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
499 src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
500 src += stride_4x;
501 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
502 src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
503 src -= stride_4x;
504
505 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
506 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
507 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
508
509 tc0 = tc[0];
510 beta30 = beta >> 3;
511 beta20 = beta >> 2;
512 tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
513 tc4 = tc[1];
514 tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
515 DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
516 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
517 LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
518 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
519 q0_src, q1_src, q2_src, q3_src);
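        /* After the 8x8 byte transpose the eight columns around the edge sit
         * in p3_src..q3_src as rows, so the same filter math as in the
         * horizontal-edge function is reused unchanged. */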
520
521 flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
522 abs(p3[-1] - p3[0]) < tc250;
523 flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
524 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
525 (d30 << 1) < beta20);
526 cmp0 = __lsx_vreplgr2vr_d(flag0);
527 DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
528 p0_src, p3_src, p2_src, p1_src, p0_src);
529
530 flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
531 abs(p1[-1] - p1[0]) < tc254;
532 flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
533 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
534 (d34 << 1) < beta20);
535 DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
536 q3_src, q0_src, q1_src, q2_src, q3_src);
537
538 cmp1 = __lsx_vreplgr2vr_d(flag1);
539 cmp2 = __lsx_vpackev_d(cmp1, cmp0);
540 cmp2 = __lsx_vseqi_d(cmp2, 0);
541
542 if (flag0 && flag1) { /* strong only */
543 /* strong filter */
544 tc_neg = __lsx_vneg_h(tc_pos);
545 /* p part */
546 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
547 temp0, temp0);
548 temp1 = __lsx_vadd_h(p3_src, p2_src);
549 temp1 = __lsx_vslli_h(temp1, 1);
550 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
551 temp1 = __lsx_vsrari_h(temp1, 3);
552 temp2 = __lsx_vsub_h(temp1, p2_src);
553 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
554 dst0 = __lsx_vadd_h(temp2, p2_src);
555
556 temp1 = __lsx_vadd_h(temp0, p2_src);
557 temp1 = __lsx_vsrari_h(temp1, 2);
558 temp2 = __lsx_vsub_h(temp1, p1_src);
559 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
560 dst1 = __lsx_vadd_h(temp2, p1_src);
561
562 temp1 = __lsx_vslli_h(temp0, 1);
563 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
564 temp1 = __lsx_vsrari_h(temp1, 3);
565 temp2 = __lsx_vsub_h(temp1, p0_src);
566 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
567 dst2 = __lsx_vadd_h(temp2, p0_src);
568
569 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
570 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
571 p_is_pcm_vec, dst0, dst1);
572 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
573
574 /* q part */
575 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
576 temp0, temp0);
577 temp1 = __lsx_vadd_h(q3_src, q2_src);
578 temp1 = __lsx_vslli_h(temp1, 1);
579 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
580 temp1 = __lsx_vsrari_h(temp1, 3);
581 temp2 = __lsx_vsub_h(temp1, q2_src);
582 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
583 dst5 = __lsx_vadd_h(temp2, q2_src);
584
585 temp1 = __lsx_vadd_h(temp0, q2_src);
586 temp1 = __lsx_vsrari_h(temp1, 2);
587 temp2 = __lsx_vsub_h(temp1, q1_src);
588 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
589 dst4 = __lsx_vadd_h(temp2, q1_src);
590
591 temp1 = __lsx_vslli_h(temp0, 1);
592 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
593 temp1 = __lsx_vsrari_h(temp1, 3);
594 temp2 = __lsx_vsub_h(temp1, q0_src);
595 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
596 dst3 = __lsx_vadd_h(temp2, q0_src);
597
598 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
599 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
600 q_is_pcm_vec, dst3, dst4);
601 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
602 /* strong filter ends */
603 } else if (flag0 == flag1) { /* weak only */
604 /* weak filter */
605 tc_pos = __lsx_vsrai_h(tc_pos, 1);
606 tc_neg = __lsx_vneg_h(tc_pos);
607
608 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
609 diff0, diff1);
610 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
611 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
612 delta0 = __lsx_vsub_h(diff0, diff1);
613 delta0 = __lsx_vsrari_h(delta0, 4);
614 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
615 __lsx_vslli_h(tc_pos, 1));
616 abs_delta0 = __lsx_vadda_h(delta0, zero);
617 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
618 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
619
620 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
621 temp2 = __lsx_vadd_h(delta0, p0_src);
622 temp2 = __lsx_vclip255_h(temp2);
623 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
624 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
625
626 temp2 = __lsx_vsub_h(q0_src, delta0);
627 temp2 = __lsx_vclip255_h(temp2);
628 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
629 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
630
631 tmp = ((beta + (beta >> 1)) >> 3);
632 DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
633 !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
634 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
635 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
636
637 DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
638 (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
639 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
640 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
641 tc_pos = __lsx_vsrai_h(tc_pos, 1);
642 tc_neg = __lsx_vneg_h(tc_pos);
643
644 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
645 delta1, delta2);
646 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
647 delta1, delta2);
648 delta1 = __lsx_vadd_h(delta1, delta0);
649 delta2 = __lsx_vsub_h(delta2, delta0);
650 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
651 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
652 tc_pos, delta1, delta2);
653 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
654 delta1, delta2);
655 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
656 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
657 q1_src, q_is_pcm_vec, delta1, delta2);
658
659 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
660 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
661 p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
662 q1_src, abs_delta0, dst0, dst1, dst2, dst3);
663 /* weak filter ends */
664
665 cmp3 = __lsx_vnor_v(cmp3, cmp3);
666 DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
667 cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
668 dst0, dst1, dst2, dst3);
669 DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
670
671 /* transpose */
672 dst4 = __lsx_vilvl_b(dst1, dst0);
673 dst5 = __lsx_vilvh_b(dst1, dst0);
674 dst0 = __lsx_vilvl_h(dst5, dst4);
675 dst1 = __lsx_vilvh_h(dst5, dst4);
676
677 src += 2;
678 __lsx_vstelm_w(dst0, src, 0, 0);
679 __lsx_vstelm_w(dst0, src + stride, 0, 1);
680 __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
681 __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
682 src += stride_4x;
683 __lsx_vstelm_w(dst1, src, 0, 0);
684 __lsx_vstelm_w(dst1, src + stride, 0, 1);
685 __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
686 __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
687 return;
688 } else { /* strong + weak */
689 /* strong filter */
690 tc_neg = __lsx_vneg_h(tc_pos);
691
692 /* p part */
693 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
694 temp0, temp0);
695
696 temp1 = __lsx_vadd_h(p3_src, p2_src);
697 temp1 = __lsx_vslli_h(temp1, 1);
698 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
699 temp1 = __lsx_vsrari_h(temp1, 3);
700 temp2 = __lsx_vsub_h(temp1, p2_src);
701 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
702 dst0 = __lsx_vadd_h(temp2, p2_src);
703
704 temp1 = __lsx_vadd_h(temp0, p2_src);
705 temp1 = __lsx_vsrari_h(temp1, 2);
706 temp2 = __lsx_vsub_h(temp1, p1_src);
707 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
708 dst1 = __lsx_vadd_h(temp2, p1_src);
709
710 temp1 = __lsx_vslli_h(temp0, 1);
711 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
712 temp1 = __lsx_vsrari_h(temp1, 3);
713 temp2 = __lsx_vsub_h(temp1, p0_src);
714 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
715 dst2 = __lsx_vadd_h(temp2, p0_src);
716
717 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
718 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
719 p_is_pcm_vec, dst0, dst1);
720 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
721
722 /* q part */
723 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
724 temp1 = __lsx_vadd_h(q3_src, q2_src);
725 temp1 = __lsx_vslli_h(temp1, 1);
726 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
727 temp1 = __lsx_vsrari_h(temp1, 3);
728 temp2 = __lsx_vsub_h(temp1, q2_src);
729 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
730 dst5 = __lsx_vadd_h(temp2, q2_src);
731
732 temp1 = __lsx_vadd_h(temp0, q2_src);
733 temp1 = __lsx_vsrari_h(temp1, 2);
734 temp2 = __lsx_vsub_h(temp1, q1_src);
735 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
736 dst4 = __lsx_vadd_h(temp2, q1_src);
737
738 temp1 = __lsx_vslli_h(temp0, 1);
739 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
740 temp1 = __lsx_vsrari_h(temp1, 3);
741 temp2 = __lsx_vsub_h(temp1, q0_src);
742 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
743 dst3 = __lsx_vadd_h(temp2, q0_src);
744
745 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
746 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
747 q_is_pcm_vec, dst3, dst4);
748 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
749 /* strong filter ends */
750
751 /* weak filter */
752 tc_pos = __lsx_vsrai_h(tc_pos, 1);
753 tc_neg = __lsx_vneg_h(tc_pos);
754
755 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
756 diff0, diff1);
757 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
758 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
759 delta0 = __lsx_vsub_h(diff0, diff1);
760 delta0 = __lsx_vsrari_h(delta0, 4);
761
762 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
763 __lsx_vslli_h(tc_pos, 1));
764 abs_delta0 = __lsx_vadda_h(delta0, zero);
765 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
766 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
767 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
768 temp2 = __lsx_vadd_h(delta0, p0_src);
769 temp2 = __lsx_vclip255_h(temp2);
770 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
771 temp2 = __lsx_vsub_h(q0_src, delta0);
772 temp2 = __lsx_vclip255_h(temp2);
773 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
774
775 tmp = (beta + (beta >> 1)) >> 3;
776 DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
777 !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
778 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
779 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
780
781 DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
782 (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
783 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
784 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
785 tc_pos = __lsx_vsrai_h(tc_pos, 1);
786 tc_neg = __lsx_vneg_h(tc_pos);
787
788 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
789 delta1, delta2);
790 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
791 delta1, delta2);
792 delta1 = __lsx_vadd_h(delta1, delta0);
793 delta2 = __lsx_vsub_h(delta2, delta0);
794 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
795 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
796 tc_pos, delta1, delta2);
797 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
798 delta1, delta2);
799 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
800 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
801 q1_src, q_is_pcm_vec, delta1, delta2);
802
803 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
804 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
805 q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
806 q0_src, abs_delta0, delta1, delta2, temp0, temp2);
807             /* weak filter ends */
808
809 /* select between weak or strong */
810 DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
811 cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
812 dst0, dst1, dst2, dst3);
813 DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
814 dst4, dst5);
815 }
816
817 cmp3 = __lsx_vnor_v(cmp3, cmp3);
818 DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
819 p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
820 DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
821 dst4, dst5);
822
823 /* pack results to 8 bit */
824 DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
825 dst5, dst0, dst1, dst2, dst3);
826
827 /* transpose */
828 DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
829 DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
830 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
831 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
832
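        /* The transposed results are written back column-wise: only the six
         * samples p2..q2 of each row change, stored as a 4-byte word at
         * offset 0 plus a 2-byte halfword at offset 4 (src + 1 points at the
         * p2 column). */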
833 src += 1;
834 __lsx_vstelm_w(dst0, src, 0, 0);
835 __lsx_vstelm_h(dst2, src, 4, 0);
836 src += stride;
837 __lsx_vstelm_w(dst0, src, 0, 1);
838 __lsx_vstelm_h(dst2, src, 4, 2);
839 src += stride;
840
841 __lsx_vstelm_w(dst0, src, 0, 2);
842 __lsx_vstelm_h(dst2, src, 4, 4);
843 src += stride;
844 __lsx_vstelm_w(dst0, src, 0, 3);
845 __lsx_vstelm_h(dst2, src, 4, 6);
846 src += stride;
847
848 __lsx_vstelm_w(dst1, src, 0, 0);
849 __lsx_vstelm_h(dst3, src, 4, 0);
850 src += stride;
851 __lsx_vstelm_w(dst1, src, 0, 1);
852 __lsx_vstelm_h(dst3, src, 4, 2);
853 src += stride;
854
855 __lsx_vstelm_w(dst1, src, 0, 2);
856 __lsx_vstelm_h(dst3, src, 4, 4);
857 src += stride;
858 __lsx_vstelm_w(dst1, src, 0, 3);
859 __lsx_vstelm_h(dst3, src, 4, 6);
860 }
861 }
862
863 void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
864 int32_t *tc, uint8_t *p_is_pcm,
865 uint8_t *q_is_pcm)
866 {
867 uint8_t *p1_ptr = src - (stride << 1);
868 uint8_t *p0_ptr = src - stride;
869 uint8_t *q0_ptr = src;
870 uint8_t *q1_ptr = src + stride;
871 __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
872 __m128i p1, p0, q0, q1;
873 __m128i tc_pos, tc_neg;
874 __m128i zero = {0};
875 __m128i temp0, temp1, delta;
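    /* Chroma deblocking uses the single weak tap
     *   delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
     * added to p0 and subtracted from q0; PCM lanes and lanes with tc <= 0
     * keep their source pixels through the select masks below. */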
876
877 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
878 DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
879 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
880 tc_neg = __lsx_vneg_h(tc_pos);
881 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
882 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
883 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
884
885 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
886 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
887 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
888
889 DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
890 p1, p0, q0, q1);
891 DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
892 p1, p0, q0, q1);
893 DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
894 temp0 = __lsx_vslli_h(temp0, 2);
895 temp0 = __lsx_vadd_h(temp0, temp1);
896 delta = __lsx_vsrari_h(temp0, 3);
897 delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
898 temp0 = __lsx_vadd_h(p0, delta);
899 temp0 = __lsx_vclip255_h(temp0);
900 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
901 temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
902
903 temp1 = __lsx_vsub_h(q0, delta);
904 temp1 = __lsx_vclip255_h(temp1);
905 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
906 temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
907
908 tc_pos = __lsx_vslei_d(tc_pos, 0);
909 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
910 temp0, temp1);
911 temp0 = __lsx_vpickev_b(temp1, temp0);
912 __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
913 __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
914 }
915 }
916
917 void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
918 int32_t *tc, uint8_t *p_is_pcm,
919 uint8_t *q_is_pcm)
920 {
921 ptrdiff_t stride_2x = (stride << 1);
922 ptrdiff_t stride_4x = (stride << 2);
923 ptrdiff_t stride_3x = stride_2x + stride;
924 __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
925 __m128i src0, src1, src2, src3, src4, src5, src6, src7;
926 __m128i p1, p0, q0, q1;
927 __m128i tc_pos, tc_neg;
928 __m128i zero = {0};
929 __m128i temp0, temp1, delta;
930
931 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
932 DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
933 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
934 tc_neg = __lsx_vneg_h(tc_pos);
935
936 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
937 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
938 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
939 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
940 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
941 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
942
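        /* Load eight rows of four pixels straddling the vertical edge and
         * transpose them so p1/p0/q0/q1 each hold one column, widened to
         * eight 16-bit lanes. */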
943 src -= 2;
944 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
945 src + stride_3x, 0, src0, src1, src2, src3);
946 src += stride_4x;
947 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
948 src + stride_3x, 0, src4, src5, src6, src7);
949 src -= stride_4x;
950 LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
951 p1, p0, q0, q1);
952 DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
953 p1, p0, q0, q1);
954
955 DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
956 temp0 = __lsx_vslli_h(temp0, 2);
957 temp0 = __lsx_vadd_h(temp0, temp1);
958 delta = __lsx_vsrari_h(temp0, 3);
959 delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
960
961 temp0 = __lsx_vadd_h(p0, delta);
962 temp1 = __lsx_vsub_h(q0, delta);
963 DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
964 DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
965 q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
966 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
967 q_is_pcm_vec, temp0, temp1);
968
969 tc_pos = __lsx_vslei_d(tc_pos, 0);
970 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
971 temp0, temp1);
972 temp0 = __lsx_vpackev_b(temp1, temp0);
973
974 src += 1;
975 __lsx_vstelm_h(temp0, src, 0, 0);
976 __lsx_vstelm_h(temp0, src + stride, 0, 1);
977 __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
978 __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
979 src += stride_4x;
980 __lsx_vstelm_h(temp0, src, 0, 4);
981 __lsx_vstelm_h(temp0, src + stride, 0, 5);
982 __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
983 __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
984 src -= stride_4x;
985 }
986 }
987
988 static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
989 int32_t dst_stride,
990 uint8_t *src,
991 int32_t src_stride,
992 int16_t *sao_offset_val,
993 int32_t height)
994 {
995 const int32_t src_stride_2x = (src_stride << 1);
996 const int32_t dst_stride_2x = (dst_stride << 1);
997 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
998 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
999 __m128i edge_idx = {0x403000201, 0x0};
1000 __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1001 __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1002 __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
1003 __m128i const1 = __lsx_vldi(1);
1004 __m128i zero = {0};
1005
1006 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1007 src -= 1;
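    /* SAO edge offset, 0-degree class (horizontal neighbours): for each pixel
     * c the index sign(c - left) + sign(c - right) + 2 selects an entry of
     * edge_idx = {1, 2, 0, 3, 4}, which in turn picks the SAO offset byte.
     * The signs are built branch-free from the eq/le compares below, and the
     * offset is applied with a saturating signed add after biasing the pixels
     * by 128 (the vxori with 0x80). */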
1008
1009 /* load in advance */
1010 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1011
1012 for (height -= 2; height; height -= 2) {
1013 src += src_stride_2x;
1014 src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1015 src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1016 src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1017
1018 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
1019 cmp_minus10, cmp_minus11);
1020 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1021 cmp_minus11, diff_minus10, diff_minus11);
1022 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
1023 cmp_minus10, cmp_minus11);
1024 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1025 cmp_minus11, cmp_minus10, cmp_minus11);
1026 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1027 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1028
1029 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1030 offset = __lsx_vaddi_bu(offset, 2);
1031
1032 /* load in advance */
1033 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
1034 src_minus10, src_minus11);
1035 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
1036 sao_offset, sao_offset, offset, offset, offset);
1037 src0 = __lsx_vxori_b(src0, 128);
1038 dst0 = __lsx_vsadd_b(src0, offset);
1039 dst0 = __lsx_vxori_b(dst0, 128);
1040
1041 __lsx_vstelm_w(dst0, dst, 0, 0);
1042 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1043 dst += dst_stride_2x;
1044 }
1045
1046 src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1047 src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1048 src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1049
1050 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
1051 cmp_minus11);
1052 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1053 diff_minus10, diff_minus11);
1054 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
1055 cmp_minus11);
1056 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1057 cmp_minus10, cmp_minus11);
1058 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1059 const1, cmp_minus11, diff_minus10, diff_minus11);
1060
1061 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1062 offset = __lsx_vaddi_bu(offset, 2);
1063 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
1064 offset, offset, offset);
1065 src0 = __lsx_vxori_b(src0, 128);
1066 dst0 = __lsx_vsadd_b(src0, offset);
1067 dst0 = __lsx_vxori_b(dst0, 128);
1068
1069 __lsx_vstelm_w(dst0, dst, 0, 0);
1070 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1071 }
1072
1073 static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
1074 int32_t dst_stride,
1075 uint8_t *src,
1076 int32_t src_stride,
1077 int16_t *sao_offset_val,
1078 int32_t height)
1079 {
1080 const int32_t src_stride_2x = (src_stride << 1);
1081 const int32_t dst_stride_2x = (dst_stride << 1);
1082 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1083 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1084 __m128i edge_idx = {0x403000201, 0x0};
1085 __m128i const1 = __lsx_vldi(1);
1086 __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1087 __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1088 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1089 __m128i zeros = {0};
1090
1091 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1092 src -= 1;
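    /* Same 0-degree computation as the 4-wide variant, but two 8-pixel rows
     * are packed into one vector per iteration (vpickev_d), so a single set
     * of compares covers both rows. */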
1093
1094 /* load in advance */
1095 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1096
1097 for (height -= 2; height; height -= 2) {
1098 src += src_stride_2x;
1099 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
1100 src_minus11, shuf1, src0, src1);
1101 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
1102 src_minus11, shuf2, src_plus10, src_plus11);
1103 DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1104 src_plus10, src_minus10, src_plus10);
1105 src0 = __lsx_vpickev_d(src1, src0);
1106
1107 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
1108 cmp_minus10, cmp_minus11);
1109 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1110 cmp_minus11, diff_minus10, diff_minus11);
1111 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
1112 cmp_minus10, cmp_minus11);
1113 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1114 cmp_minus11, cmp_minus10, cmp_minus11);
1115 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1116 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1117
1118 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1119 offset = __lsx_vaddi_bu(offset, 2);
1120
1121 /* load in advance */
1122 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
1123 src_minus10, src_minus11);
1124 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1125 sao_offset, offset, offset, offset);
1126 src0 = __lsx_vxori_b(src0, 128);
1127 dst0 = __lsx_vsadd_b(src0, offset);
1128 dst0 = __lsx_vxori_b(dst0, 128);
1129
1130 __lsx_vstelm_d(dst0, dst, 0, 0);
1131 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1132 dst += dst_stride_2x;
1133 }
1134
1135 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
1136 shuf1, src0, src1);
1137 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
1138 shuf2, src_plus10, src_plus11);
1139 DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1140 src_plus10, src_minus10, src_plus10);
1141 src0 = __lsx_vpickev_d(src1, src0);
1142
1143 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
1144 cmp_minus11);
1145 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1146 diff_minus10, diff_minus11);
1147 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
1148 cmp_minus11);
1149 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1150 cmp_minus10, cmp_minus11);
1151 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1152 const1, cmp_minus11, diff_minus10, diff_minus11);
1153
1154 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1155 offset = __lsx_vaddi_bu(offset, 2);
1156 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1157 sao_offset, offset, offset, offset);
1158 src0 = __lsx_vxori_b(src0, 128);
1159 dst0 = __lsx_vsadd_b(src0, offset);
1160 dst0 = __lsx_vxori_b(dst0, 128);
1161
1162 __lsx_vstelm_d(dst0, dst, 0, 0);
1163 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1164 }
1165
1166 static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
1167 int32_t dst_stride,
1168 uint8_t *src,
1169 int32_t src_stride,
1170 int16_t *sao_offset_val,
1171 int32_t width,
1172 int32_t height)
1173 {
1174 uint8_t *dst_ptr, *src_minus1;
1175 int32_t v_cnt;
1176 const int32_t src_stride_2x = (src_stride << 1);
1177 const int32_t dst_stride_2x = (dst_stride << 1);
1178 const int32_t src_stride_4x = (src_stride << 2);
1179 const int32_t dst_stride_4x = (dst_stride << 2);
1180 const int32_t src_stride_3x = src_stride_2x + src_stride;
1181 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1182
1183 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1184 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1185 __m128i edge_idx = {0x403000201, 0x0};
1186 __m128i const1 = __lsx_vldi(1);
1187 __m128i sao_offset;
1188 __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1189 __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1190 __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1191 __m128i diff_plus13;
1192 __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1193 __m128i src_minus10, src_minus11, src_minus12, src_minus13;
1194 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1195 __m128i src_zero0, src_zero1, src_zero2, src_zero3;
1196 __m128i src_plus10, src_plus11, src_plus12, src_plus13;
1197
1198 sao_offset = __lsx_vld(sao_offset_val, 0);
1199 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
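    /* 16 output pixels per inner step, four rows per outer iteration.
     * src_minus1x always holds the 16 bytes starting one pixel to the left,
     * and vshuf_b with shuf1/shuf2 extracts the centre and right-neighbour
     * rows from the concatenation of the previous and current loads. */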
1200
1201 for (; height; height -= 4) {
1202 src_minus1 = src - 1;
1203 src_minus10 = __lsx_vld(src_minus1, 0);
1204 DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1205 src_stride_2x, src_minus11, src_minus12);
1206 src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
1207
1208 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1209 src_minus1 += 16;
1210 dst_ptr = dst + v_cnt;
1211 src10 = __lsx_vld(src_minus1, 0);
1212 DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1213 src_stride_2x, src11, src12);
1214 src13 = __lsx_vldx(src_minus1, src_stride_3x);
1215 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
1216 src_minus11, shuf1, src12, src_minus12, shuf1, src13,
1217 src_minus13, shuf1, src_zero0, src_zero1,
1218 src_zero2, src_zero3);
1219 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
1220 src_minus11, shuf2, src12, src_minus12, shuf2, src13,
1221 src_minus13, shuf2, src_plus10, src_plus11,
1222 src_plus12, src_plus13);
1223 DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1224 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1225 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1226 DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1227 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1228 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1229 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1230 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1231 cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1232 diff_plus11);
1233 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1234 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1235 cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1236 diff_plus13);
1237 DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1238 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1239 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1240 DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1241 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1242 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1243 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1244 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1245 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1246 cmp_plus11);
1247 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1248 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1249 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1250 cmp_plus13);
1251 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1252 diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1253 cmp_minus11, diff_plus11, const1, cmp_plus11,
1254 diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1255 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1256 diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1257 cmp_minus13, diff_plus13, const1, cmp_plus13,
1258 diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1259
1260 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1261 diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1262 diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1263 offset_mask3);
1264 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1265 offset_mask2, 2, offset_mask3, 2, offset_mask0,
1266 offset_mask1, offset_mask2, offset_mask3);
1267 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1268 sao_offset, sao_offset, offset_mask0, offset_mask0,
1269 offset_mask0);
1270 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1271 sao_offset, sao_offset, offset_mask1, offset_mask1,
1272 offset_mask1);
1273 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1274 sao_offset, sao_offset, offset_mask2, offset_mask2,
1275 offset_mask2);
1276 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1277 sao_offset, sao_offset, offset_mask3, offset_mask3,
1278 offset_mask3);
1279
1280 DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
1281 src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
1282 src_zero2, src_zero3);
1283 DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1284 offset_mask1, src_zero2, offset_mask2, src_zero3,
1285 offset_mask3, dst0, dst1, dst2, dst3);
1286 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1287 128, dst0, dst1, dst2, dst3);
1288
1289 src_minus10 = src10;
1290 src_minus11 = src11;
1291 src_minus12 = src12;
1292 src_minus13 = src13;
1293
1294 __lsx_vst(dst0, dst_ptr, 0);
1295 __lsx_vst(dst1, dst_ptr + dst_stride, 0);
1296 __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
1297 __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
1298 }
1299 src += src_stride_4x;
1300 dst += dst_stride_4x;
1301 }
1302 }
1303
1304 static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
1305 int32_t dst_stride,
1306 uint8_t *src,
1307 int32_t src_stride,
1308 int16_t *sao_offset_val,
1309 int32_t height)
1310 {
1311 const int32_t src_stride_2x = (src_stride << 1);
1312 const int32_t dst_stride_2x = (dst_stride << 1);
1313 __m128i edge_idx = {0x403000201, 0x0};
1314 __m128i const1 = __lsx_vldi(1);
1315 __m128i dst0;
1316 __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1317 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1318 __m128i src_minus10, src_minus11, src10, src11;
1319 __m128i src_zero0, src_zero1;
1320 __m128i offset;
1321 __m128i offset_mask0, offset_mask1;
1322
1323 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
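    /* 90-degree class: the neighbours are the rows directly above and below.
     * Each centre row is duplicated and interleaved with its two neighbour
     * rows so one set of byte compares yields both signs; vhaddw_hu_bu sums
     * each pair and, after adding 2, the low byte of every halfword is the
     * 0..4 edge-offset index picked out by the later vpickev_b. */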
1324
1325 /* load in advance */
1326 DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
1327 src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
1328
1329 for (height -= 2; height; height -= 2) {
1330 src += src_stride_2x;
1331 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1332 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1333 src_minus11, src_zero1);
1334 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1335 cmp_minus10, cmp_minus11);
1336 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1337 cmp_minus11, diff_minus10, diff_minus11);
1338 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1339 src_minus11, cmp_minus10, cmp_minus11);
1340 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1341 cmp_minus11, cmp_minus10, cmp_minus11);
1342 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1343 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1344
1345 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1346 diff_minus11, offset_mask0, offset_mask1);
1347 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1348 offset_mask0, offset_mask1);
1349 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1350 src_zero0, offset, dst0);
1351 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1352 sao_offset, offset, offset, offset);
1353
1354 dst0 = __lsx_vxori_b(dst0, 128);
1355 dst0 = __lsx_vsadd_b(dst0, offset);
1356 dst0 = __lsx_vxori_b(dst0, 128);
1357 src_minus10 = src10;
1358 src_minus11 = src11;
1359
1360 /* load in advance */
1361 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1362 src10, src11);
1363
1364 __lsx_vstelm_w(dst0, dst, 0, 0);
1365 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1366 dst += dst_stride_2x;
1367 }
1368
1369 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1370 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1371 src_minus11, src_zero1);
1372 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1373 cmp_minus10, cmp_minus11);
1374 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1375 diff_minus10, diff_minus11);
1376 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1377 cmp_minus10, cmp_minus11);
1378 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1379 cmp_minus10, cmp_minus11);
1380 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1381 const1, cmp_minus11, diff_minus10, diff_minus11);
1382
1383 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1384 diff_minus11, offset_mask0, offset_mask1);
1385 DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1386 offset_mask0, offset_mask1);
1387 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1388 src_zero0, offset, dst0);
1389 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1390 sao_offset, offset, offset, offset);
1391 dst0 = __lsx_vxori_b(dst0, 128);
1392 dst0 = __lsx_vsadd_b(dst0, offset);
1393 dst0 = __lsx_vxori_b(dst0, 128);
1394
1395 __lsx_vstelm_w(dst0, dst, 0, 0);
1396 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1397 }
1398
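/* Vertical (90 degree) SAO edge offset filtering for a block 8 pixels
 * wide: the neighbours are the pixels at (x, y - 1) and (x, y + 1).
 * Two rows are filtered per iteration and stored with 8-byte element
 * stores. */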
1399 static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
1400 int32_t dst_stride,
1401 uint8_t *src,
1402 int32_t src_stride,
1403 int16_t *sao_offset_val,
1404 int32_t height)
1405 {
1406 const int32_t src_stride_2x = (src_stride << 1);
1407 const int32_t dst_stride_2x = (dst_stride << 1);
1408 __m128i edge_idx = {0x403000201, 0x0};
1409 __m128i const1 = __lsx_vldi(1);
1410 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1411 __m128i src_zero0, src_zero1, dst0;
1412 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1413 __m128i src_minus10, src_minus11, src10, src11;
1414 __m128i offset_mask0, offset_mask1;
1415
1416 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1417
1418 /* load in advance */
1419 DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
1420 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
1421
1422 for (height -= 2; height; height -= 2) {
1423 src += src_stride_2x;
1424 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1425 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1426 src_minus11, src_zero1);
1427 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1428 cmp_minus10, cmp_minus11);
1429 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1430 cmp_minus11, diff_minus10, diff_minus11);
1431 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1432 src_minus11, cmp_minus10, cmp_minus11);
1433 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1434 cmp_minus11, cmp_minus10, cmp_minus11);
1435 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1436 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1437
1438 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1439 diff_minus11, offset_mask0, offset_mask1);
1440 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1441 offset_mask0, offset_mask1);
1442 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1443 src_zero0, offset, dst0);
1444 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1445 sao_offset, offset, offset, offset);
1446
1447 dst0 = __lsx_vxori_b(dst0, 128);
1448 dst0 = __lsx_vsadd_b(dst0, offset);
1449 dst0 = __lsx_vxori_b(dst0, 128);
1450 src_minus10 = src10;
1451 src_minus11 = src11;
1452
1453 /* load in advance */
1454 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1455 src10, src11);
1456
1457 __lsx_vstelm_d(dst0, dst, 0, 0);
1458 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1459 dst += dst_stride_2x;
1460 }
1461
1462 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1463 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1464 src_minus11, src_zero1);
1465 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1466 cmp_minus10, cmp_minus11);
1467 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1468 diff_minus10, diff_minus11);
1469 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1470 cmp_minus10, cmp_minus11);
1471 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1472 cmp_minus10, cmp_minus11);
1473 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1474 const1, cmp_minus11, diff_minus10, diff_minus11);
1475
1476 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1477 diff_minus11, offset_mask0, offset_mask1);
1478 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1479 offset_mask0, offset_mask1);
1480 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1481 src_zero0, offset, dst0);
1482 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1483 sao_offset, offset, offset, offset);
1484 dst0 = __lsx_vxori_b(dst0, 128);
1485 dst0 = __lsx_vsadd_b(dst0, offset);
1486 dst0 = __lsx_vxori_b(dst0, 128);
1487
1488 __lsx_vstelm_d(dst0, dst, 0, 0);
1489 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1490 }
1491
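/* Vertical (90 degree) SAO edge offset filtering for widths that are a
 * multiple of 16: the block is walked in 16-pixel columns and four rows
 * are filtered per iteration inside each column. */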
1492 static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
1493 int32_t dst_stride,
1494 uint8_t *src,
1495 int32_t src_stride,
1496 int16_t *
1497 sao_offset_val,
1498 int32_t width,
1499 int32_t height)
1500 {
1501 uint8_t *src_orig = src;
1502 uint8_t *dst_orig = dst;
1503 int32_t h_cnt, v_cnt;
1504 const int32_t src_stride_2x = (src_stride << 1);
1505 const int32_t dst_stride_2x = (dst_stride << 1);
1506 const int32_t src_stride_4x = (src_stride << 2);
1507 const int32_t dst_stride_4x = (dst_stride << 2);
1508 const int32_t src_stride_3x = src_stride_2x + src_stride;
1509 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1510 __m128i edge_idx = {0x403000201, 0x0};
1511 __m128i const1 = __lsx_vldi(1);
1512 __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1513 __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1514 __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1515 __m128i diff_plus13;
1516 __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1517 __m128i src12, dst2, src13, dst3;
1518 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1519
1520 sao_offset = __lsx_vld(sao_offset_val, 0);
1521 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1522
1523 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1524 src = src_orig + v_cnt;
1525 dst = dst_orig + v_cnt;
1526
1527 DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
1528 src_minus10, src_minus11);
1529
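        /* Four rows per iteration: sign(centre - above) and
         * sign(centre - below) are built separately for each row with the
         * vseq_b/vsle_bu/vnor_v/vbitsel_v pattern, summed with vadd_b and
         * biased by 2 to form the edge index, which is mapped through
         * edge_idx and the packed SAO offsets before the saturating add. */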
1530 for (h_cnt = (height >> 2); h_cnt--;) {
1531 DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1532 src, src_stride_3x, src, src_stride_4x,
1533 src10, src11, src12, src13);
1534 DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
1535 src10, src10, src_minus11, src10, src11, cmp_minus10,
1536 cmp_plus10, cmp_minus11, cmp_plus11);
1537 DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
1538 src12, src13, cmp_minus12, cmp_plus12,
1539 cmp_minus13, cmp_plus13);
1540 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1541 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1542 cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1543 diff_plus11);
1544 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1545 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1546 cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1547 diff_plus13);
1548 DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
1549 src10, src10, src_minus11, src10, src11, cmp_minus10,
1550 cmp_plus10, cmp_minus11, cmp_plus11);
1551 DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
1552 src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
1553 cmp_plus13);
1554 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1555 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1556 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1557 cmp_plus11);
1558 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1559 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1560 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1561 cmp_plus13);
1562 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1563 diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1564 cmp_minus11, diff_plus11, const1, cmp_plus11,
1565 diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1566 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1567 diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1568 cmp_minus13, diff_plus13, const1, cmp_plus13,
1569 diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1570
1571 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1572 diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1573 diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1574 offset_mask3);
1575 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1576 offset_mask2, 2, offset_mask3, 2, offset_mask0,
1577 offset_mask1, offset_mask2, offset_mask3);
1578 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1579                       sao_offset, sao_offset, offset_mask0,
1580 offset_mask0, offset_mask0);
1581 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1582 sao_offset, sao_offset, offset_mask1, offset_mask1,
1583 offset_mask1);
1584 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1585 sao_offset, sao_offset, offset_mask2, offset_mask2,
1586 offset_mask2);
1587 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1588 sao_offset, sao_offset, offset_mask3, offset_mask3,
1589 offset_mask3);
1590
1591 src_minus10 = src12;
1592 DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
1593 src12, 128, src_minus11, src10, src11, src12);
1594 DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
1595 offset_mask1, src11, offset_mask2, src12,
1596 offset_mask3, dst0, dst1, dst2, dst3);
1597 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1598 128, dst0, dst1, dst2, dst3);
1599 src_minus11 = src13;
1600
1601 __lsx_vst(dst0, dst, 0);
1602 __lsx_vstx(dst1, dst, dst_stride);
1603 __lsx_vstx(dst2, dst, dst_stride_2x);
1604 __lsx_vstx(dst3, dst, dst_stride_3x);
1605 src += src_stride_4x;
1606 dst += dst_stride_4x;
1607 }
1608 }
1609 }
1610
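/* Diagonal SAO edge offset filtering (dispatched for eo == 2) for a block
 * 4 pixels wide: the neighbours are the pixels at (x - 1, y - 1) and
 * (x + 1, y + 1).  The rows are loaded one byte to the left of the block,
 * so shuf1 (shift by one) recovers the centre pixels and shuf2 (shift by
 * two) the lower-right neighbours. */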
1611 static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
1612 int32_t dst_stride,
1613 uint8_t *src,
1614 int32_t src_stride,
1615 int16_t *sao_offset_val,
1616 int32_t height)
1617 {
1618 uint8_t *src_orig;
1619 const int32_t src_stride_2x = (src_stride << 1);
1620 const int32_t dst_stride_2x = (dst_stride << 1);
1621 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1622 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1623 __m128i edge_idx = {0x403000201, 0x0};
1624 __m128i const1 = __lsx_vldi(1);
1625 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1626 __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1627 __m128i src_minus11, src10, src11;
1628 __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1629 __m128i offset_mask0, offset_mask1;
1630 __m128i zeros = {0};
1631
1632 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1633 src_orig = src - 1;
1634
1635 /* load in advance */
1636 DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
1637 src_minus10, src_minus11);
1638 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1639 src10, src11);
1640
1641 for (height -= 2; height; height -= 2) {
1642 src_orig += src_stride_2x;
1643
1644 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1645 shuf1, src_zero0, src_zero1);
1646 DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1647 src_plus0, src_plus1);
1648
1649 DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
1650 src_minus11, src_minus10, src_minus11);
1651 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
1652 src_zero1, src_zero0, src_zero1);
1653 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
1654 src_minus11, cmp_minus10, cmp_minus11);
1655 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1656 cmp_minus11, diff_minus10, diff_minus11);
1657 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1658 src_minus11, cmp_minus10, cmp_minus11);
1659 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1660 cmp_minus11, cmp_minus10, cmp_minus11);
1661 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1662 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1663
1664 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1665 diff_minus11, offset_mask0, offset_mask1);
1666 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1667 offset_mask0, offset_mask1);
1668 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1669 src_zero0, offset, dst0);
1670 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1671 sao_offset, offset, offset, offset);
1672 dst0 = __lsx_vxori_b(dst0, 128);
1673 dst0 = __lsx_vsadd_b(dst0, offset);
1674 dst0 = __lsx_vxori_b(dst0, 128);
1675
1676 src_minus10 = src10;
1677 src_minus11 = src11;
1678
1679 /* load in advance */
1680 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1681 src10, src11);
1682
1683 __lsx_vstelm_w(dst0, dst, 0, 0);
1684 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1685 dst += dst_stride_2x;
1686 }
1687
1688 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1689 src_zero0, src_zero1);
1690 DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1691 src_plus0, src_plus1);
1692
1693 DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
1694 src_minus10, src_minus11);
1695 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1696 src_zero0, src_zero1);
1697 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1698 cmp_minus10, cmp_minus11);
1699 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1700 diff_minus10, diff_minus11);
1701 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1702 cmp_minus10, cmp_minus11);
1703 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1704 cmp_minus10, cmp_minus11);
1705 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1706 const1, cmp_minus11, diff_minus10, diff_minus11);
1707
1708 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1709 diff_minus11, offset_mask0, offset_mask1);
1710 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1711 offset_mask1);
1712 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1713 src_zero0, offset, dst0);
1714 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1715 sao_offset, offset, offset, offset);
1716 dst0 = __lsx_vxori_b(dst0, 128);
1717 dst0 = __lsx_vsadd_b(dst0, offset);
1718 dst0 = __lsx_vxori_b(dst0, 128);
1719
1720 __lsx_vstelm_w(dst0, dst, 0, 0);
1721 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1722 }
1723
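/* The same diagonal filtering as above ((x - 1, y - 1) / (x + 1, y + 1)
 * neighbours, eo == 2) for a block 8 pixels wide, two rows per
 * iteration. */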
1724 static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
1725 int32_t dst_stride,
1726 uint8_t *src,
1727 int32_t src_stride,
1728 int16_t *sao_offset_val,
1729 int32_t height)
1730 {
1731 uint8_t *src_orig;
1732 const int32_t src_stride_2x = (src_stride << 1);
1733 const int32_t dst_stride_2x = (dst_stride << 1);
1734 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1735 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1736 __m128i edge_idx = {0x403000201, 0x0};
1737 __m128i const1 = __lsx_vldi(1);
1738 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1739 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1740 __m128i src_minus10, src10, src_minus11, src11;
1741 __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1742 __m128i offset_mask0, offset_mask1;
1743 __m128i zeros = {0};
1744
1745 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1746 src_orig = src - 1;
1747
1748 /* load in advance */
1749 DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
1750 src_minus11);
1751 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1752 src10, src11);
1753
1754 for (height -= 2; height; height -= 2) {
1755 src_orig += src_stride_2x;
1756
1757 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1758 shuf1, src_zero0, src_zero1);
1759 DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1760 src_plus10, src_plus11);
1761
1762 DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
1763 src_minus11, src_minus10, src_minus11);
1764 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1765 src_zero0, src_zero1);
1766 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1767 cmp_minus10, cmp_minus11);
1768 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1769 cmp_minus11, diff_minus10, diff_minus11);
1770 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1771 src_minus11, cmp_minus10, cmp_minus11);
1772 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1773 cmp_minus11, cmp_minus10, cmp_minus11);
1774 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1775 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1776
1777 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1778 diff_minus11, offset_mask0, offset_mask1);
1779 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1780 offset_mask0, offset_mask1);
1781 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1782 src_zero0, offset, dst0);
1783 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1784 sao_offset, offset, offset, offset);
1785 dst0 = __lsx_vxori_b(dst0, 128);
1786 dst0 = __lsx_vsadd_b(dst0, offset);
1787 dst0 = __lsx_vxori_b(dst0, 128);
1788
1789 src_minus10 = src10;
1790 src_minus11 = src11;
1791
1792 /* load in advance */
1793 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1794                   src10, src11);
1795 __lsx_vstelm_d(dst0, dst, 0, 0);
1796 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1797 dst += dst_stride_2x;
1798 }
1799
1800 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1801 src_zero0, src_zero1);
1802 DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1803 src_plus10, src_plus11);
1804 DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
1805 src_minus10, src_minus11);
1806 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1807 src_zero0, src_zero1);
1808
1809 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1810 cmp_minus10, cmp_minus11);
1811 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1812 cmp_minus11, diff_minus10, diff_minus11);
1813 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1814 cmp_minus10, cmp_minus11);
1815 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1816 cmp_minus10, cmp_minus11);
1817 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1818 const1, cmp_minus11, diff_minus10, diff_minus11);
1819
1820 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1821 diff_minus11, offset_mask0, offset_mask1);
1822 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1823 offset_mask1);
1824 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1825 src_zero0, offset, dst0);
1826 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1827 sao_offset, offset, offset, offset);
1828 dst0 = __lsx_vxori_b(dst0, 128);
1829 dst0 = __lsx_vsadd_b(dst0, offset);
1830 dst0 = __lsx_vxori_b(dst0, 128);
1831
1832 src_minus10 = src10;
1833 src_minus11 = src11;
1834
1835 /* load in advance */
1836 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1837 src10, src11);
1838
1839 __lsx_vstelm_d(dst0, dst, 0, 0);
1840 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1841 }
1842
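/* Diagonal ((x - 1, y - 1) / (x + 1, y + 1)) SAO edge offset filtering for
 * widths that are a multiple of 16, four rows per iteration.  The rows are
 * read 16 bytes at a time starting one byte to the left of the block, so
 * every 16-pixel column also needs the first bytes of the following
 * column; the vshuf_b calls with shuf1/shuf2 stitch the two loads
 * together. */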
1843 static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
1844 int32_t dst_stride,
1845 uint8_t *src,
1846 int32_t src_stride,
1847 int16_t *
1848 sao_offset_val,
1849 int32_t width,
1850 int32_t height)
1851 {
1852 uint8_t *src_orig = src;
1853 uint8_t *dst_orig = dst;
1854 int32_t v_cnt;
1855 const int32_t src_stride_2x = (src_stride << 1);
1856 const int32_t dst_stride_2x = (dst_stride << 1);
1857 const int32_t src_stride_4x = (src_stride << 2);
1858 const int32_t dst_stride_4x = (dst_stride << 2);
1859 const int32_t src_stride_3x = src_stride_2x + src_stride;
1860 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1861
1862 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1863 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1864 __m128i edge_idx = {0x403000201, 0x0};
1865 __m128i const1 = __lsx_vldi(1);
1866 __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1867 __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1868 __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1869 __m128i diff_plus13, src_minus14, src_plus13;
1870 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1871 __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1872 __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
1873 __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
1874 __m128i src_zero3, sao_offset, src_plus12;
1875
1876 sao_offset = __lsx_vld(sao_offset_val, 0);
1877 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1878
1879 for (; height; height -= 4) {
1880 src_orig = src - 1;
1881 dst_orig = dst;
1882 src_minus11 = __lsx_vld(src_orig, 0);
1883 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1884 src_minus12, src_minus13);
1885 src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
1886
1887 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1888 src_minus10 = __lsx_vld(src_orig - src_stride, 0);
1889 src_orig += 16;
1890 src10 = __lsx_vld(src_orig, 0);
1891 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
1892 src_stride_2x, src11, src12);
1893 src13 = __lsx_vldx(src_orig, src_stride_3x);
1894 src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
1895
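            /* Rebuild the unaligned rows: shuf1 picks bytes 1..16 of the
             * pair {next block, current block}, i.e. the centre pixels,
             * while shuf2 picks bytes 2..17 of the row below, i.e. the
             * (x + 1, y + 1) neighbours; the unshifted rows above provide
             * the (x - 1, y - 1) neighbours. */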
1896 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
1897 src_minus12, shuf1, src12, src_minus13, shuf1,
1898 src13, src_minus14, shuf1, src_zero0, src_zero1,
1899 src_zero2, src_zero3);
1900 DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
1901 src_minus13, shuf2, src_plus10, src_plus11);
1902 src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
1903
1904 DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1905 src_plus10, src_zero1, src_minus11, src_zero1,
1906 src_plus11, cmp_minus10, cmp_plus10,
1907 cmp_minus11, cmp_plus11);
1908 DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1909 src_plus12, src_zero3, src_minus13, src_zero3,
1910 src_plus13, cmp_minus12, cmp_plus12,
1911 cmp_minus13, cmp_plus13);
1912 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1913 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1914 cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1915 diff_plus11);
1916 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1917 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1918 cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1919 diff_plus13);
1920 DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1921 src_plus10, src_zero1, src_minus11, src_zero1,
1922 src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1923 cmp_plus11);
1924 DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1925 src_plus12, src_zero3, src_minus13, src_zero3,
1926 src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1927 cmp_plus13);
1928 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1929 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1930 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1931 cmp_plus11);
1932 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1933 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1934 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1935 cmp_plus13);
1936 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1937 diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1938 cmp_minus11, diff_plus11, const1, cmp_plus11,
1939 diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1940 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1941 diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1942 cmp_minus13, diff_plus13, const1, cmp_plus13,
1943 diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1944
1945 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1946 diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1947 diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1948 offset_mask3);
1949 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1950 offset_mask2, 2, offset_mask3, 2, offset_mask0,
1951 offset_mask1, offset_mask2, offset_mask3);
1952
1953 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1954 sao_offset, sao_offset, offset_mask0, offset_mask0,
1955 offset_mask0);
1956 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1957 sao_offset, sao_offset, offset_mask1, offset_mask1,
1958 offset_mask1);
1959 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1960 sao_offset, sao_offset, offset_mask2, offset_mask2,
1961 offset_mask2);
1962 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1963 sao_offset, sao_offset, offset_mask3, offset_mask3,
1964 offset_mask3);
1965
1966 DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
1967 128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
1968 src_zero3);
1969 DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1970 offset_mask1, src_zero2, offset_mask2, src_zero3,
1971 offset_mask3, dst0, dst1, dst2, dst3);
1972 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1973 128, dst0, dst1, dst2, dst3);
1974
1975 src_minus11 = src10;
1976 src_minus12 = src11;
1977 src_minus13 = src12;
1978 src_minus14 = src13;
1979
1980 __lsx_vst(dst0, dst_orig, 0);
1981 __lsx_vstx(dst1, dst_orig, dst_stride);
1982 __lsx_vstx(dst2, dst_orig, dst_stride_2x);
1983 __lsx_vstx(dst3, dst_orig, dst_stride_3x);
1984 dst_orig += 16;
1985 }
1986 src += src_stride_4x;
1987 dst += dst_stride_4x;
1988 }
1989 }
1990
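/* Anti-diagonal SAO edge offset filtering (dispatched for eo == 3) for a
 * block 4 pixels wide: the neighbours are the pixels at (x + 1, y - 1)
 * and (x - 1, y + 1). */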
1991 static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
1992 int32_t dst_stride,
1993 uint8_t *src,
1994 int32_t src_stride,
1995 int16_t *sao_offset_val,
1996 int32_t height)
1997 {
1998 uint8_t *src_orig;
1999 const int32_t src_stride_2x = (src_stride << 1);
2000 const int32_t dst_stride_2x = (dst_stride << 1);
2001
2002 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2003 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2004 __m128i edge_idx = {0x403000201, 0x0};
2005 __m128i const1 = __lsx_vldi(1);
2006 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2007 __m128i src_zero0, src_zero1, dst0;
2008 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2009 __m128i src_minus10, src10, src_minus11, src11;
2010 __m128i offset_mask0, offset_mask1;
2011 __m128i zeros = {0};
2012
2013 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2014 src_orig = src - 1;
2015
2016 /* load in advance */
2017 DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2018 src_minus10, src_minus11);
2019 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2020 src10, src11);
2021
2022 for (height -= 2; height; height -= 2) {
2023 src_orig += src_stride_2x;
2024
2025 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2026 shuf1, src_zero0, src_zero1);
2027 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2028 shuf2, src_minus10, src_minus11);
2029
2030 DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2031 src_minus10, src_minus11);
2032 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2033 src_zero0, src_zero1);
2034 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2035 cmp_minus10, cmp_minus11);
2036 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2037 cmp_minus11, diff_minus10, diff_minus11);
2038 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
2039 src_minus11, cmp_minus10, cmp_minus11);
2040 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2041 cmp_minus11, cmp_minus10, cmp_minus11);
2042 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2043 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
2044
2045 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2046 diff_minus11, offset_mask0, offset_mask1);
2047 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
2048 offset_mask0, offset_mask1);
2049 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2050 src_zero0, offset, dst0);
2051 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2052 sao_offset, offset, offset, offset);
2053 dst0 = __lsx_vxori_b(dst0, 128);
2054 dst0 = __lsx_vsadd_b(dst0, offset);
2055 dst0 = __lsx_vxori_b(dst0, 128);
2056
2057 src_minus10 = src10;
2058 src_minus11 = src11;
2059
2060 /* load in advance */
2061 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2062 src10, src11);
2063
2064 __lsx_vstelm_w(dst0, dst, 0, 0);
2065 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
2066 dst += dst_stride_2x;
2067 }
2068
2069 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
2070 src_zero0, src_zero1);
2071 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2072 shuf2, src_minus10, src_minus11);
2073
2074 DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2075 src_minus10, src_minus11);
2076 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2077 src_zero0, src_zero1);
2078 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2079 cmp_minus10, cmp_minus11);
2080 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2081 cmp_minus11, diff_minus10, diff_minus11);
2082 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
2083 cmp_minus10, cmp_minus11);
2084 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2085 cmp_minus10, cmp_minus11);
2086 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
2087 const1, cmp_minus11, diff_minus10, diff_minus11);
2088
2089 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2090 diff_minus11, offset_mask0, offset_mask1);
2091 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
2092 offset_mask1);
2093 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2094 src_zero0, offset, dst0);
2095 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2096 sao_offset, offset, offset, offset);
2097 dst0 = __lsx_vxori_b(dst0, 128);
2098 dst0 = __lsx_vsadd_b(dst0, offset);
2099 dst0 = __lsx_vxori_b(dst0, 128);
2100
2101 __lsx_vstelm_w(dst0, dst, 0, 0);
2102 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
2103 dst += dst_stride_2x;
2104 }
2105
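/* The same anti-diagonal filtering ((x + 1, y - 1) / (x - 1, y + 1)
 * neighbours, eo == 3) for a block 8 pixels wide, two rows per
 * iteration. */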
2106 static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
2107 int32_t dst_stride,
2108 uint8_t *src,
2109 int32_t src_stride,
2110 int16_t *sao_offset_val,
2111 int32_t height)
2112 {
2113 uint8_t *src_orig;
2114 const int32_t src_stride_2x = (src_stride << 1);
2115 const int32_t dst_stride_2x = (dst_stride << 1);
2116
2117 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2118 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2119 __m128i edge_idx = {0x403000201, 0x0};
2120 __m128i const1 = __lsx_vldi(1);
2121 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2122 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2123 __m128i src_minus10, src10, src_minus11, src11;
2124 __m128i src_zero0, src_zero1, dst0;
2125 __m128i offset_mask0, offset_mask1;
2126 __m128i zeros = {0};
2127
2128 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2129 src_orig = src - 1;
2130
2131 /* load in advance */
2132 DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2133 src_minus10, src_minus11);
2134 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2135 src10, src11);
2136
2137 for (height -= 2; height; height -= 2) {
2138 src_orig += src_stride_2x;
2139
2140 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2141 shuf1, src_zero0, src_zero1);
2142 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2143 shuf2, src_minus10, src_minus11);
2144
2145 DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2146 src_minus10, src_minus11);
2147 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2148 src_zero0, src_zero1);
2149 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2150 cmp_minus10, cmp_minus11);
2151 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2152 cmp_minus11, diff_minus10, diff_minus11);
2153 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
2154 src_minus11, cmp_minus10, cmp_minus11);
2155 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2156 cmp_minus11, cmp_minus10, cmp_minus11);
2157 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2158 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
2159
2160 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2161 diff_minus11, offset_mask0, offset_mask1);
2162 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
2163 offset_mask0, offset_mask1);
2164 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2165 src_zero0, offset, dst0);
2166 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2167 sao_offset, offset, offset, offset);
2168 dst0 = __lsx_vxori_b(dst0, 128);
2169 dst0 = __lsx_vsadd_b(dst0, offset);
2170 dst0 = __lsx_vxori_b(dst0, 128);
2171
2172 src_minus10 = src10;
2173 src_minus11 = src11;
2174
2175 /* load in advance */
2176 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2177 src10, src11);
2178
2179 __lsx_vstelm_d(dst0, dst, 0, 0);
2180 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
2181 dst += dst_stride_2x;
2182 }
2183
2184 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
2185 src_zero0, src_zero1);
2186 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2187 shuf2, src_minus10, src_minus11);
2188
2189 DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2190 src_minus10, src_minus11);
2191 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2192 src_zero0, src_zero1);
2193 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2194 cmp_minus10, cmp_minus11);
2195 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2196 diff_minus10, diff_minus11);
2197 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
2198 cmp_minus10, cmp_minus11);
2199 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2200 cmp_minus10, cmp_minus11);
2201 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
2202 const1, cmp_minus11, diff_minus10, diff_minus11);
2203
2204 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2205 diff_minus11, offset_mask0, offset_mask1);
2206 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
2207 offset_mask1);
2208 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2209 src_zero0, offset, dst0);
2210 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2211 sao_offset, offset, offset, offset);
2212 dst0 = __lsx_vxori_b(dst0, 128);
2213 dst0 = __lsx_vsadd_b(dst0, offset);
2214 dst0 = __lsx_vxori_b(dst0, 128);
2215
2216 __lsx_vstelm_d(dst0, dst, 0, 0);
2217 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
2218 }
2219
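/* Anti-diagonal ((x + 1, y - 1) / (x - 1, y + 1)) SAO edge offset
 * filtering for widths that are a multiple of 16, four rows per
 * iteration, using the same shuf1/shuf2 stitching of neighbouring 16-byte
 * blocks as the 45 degree case. */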
2220 static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
2221 int32_t dst_stride,
2222 uint8_t *src,
2223 int32_t src_stride,
2224 int16_t *sao_offset_val,
2225 int32_t width,
2226 int32_t height)
2227 {
2228 uint8_t *src_orig, *dst_orig;
2229 int32_t v_cnt;
2230 const int32_t src_stride_2x = (src_stride << 1);
2231 const int32_t dst_stride_2x = (dst_stride << 1);
2232 const int32_t src_stride_4x = (src_stride << 2);
2233 const int32_t dst_stride_4x = (dst_stride << 2);
2234 const int32_t src_stride_3x = src_stride_2x + src_stride;
2235 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
2236
2237 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2238 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2239 __m128i edge_idx = {0x403000201, 0x0};
2240 __m128i const1 = __lsx_vldi(1);
2241 __m128i dst0, dst1, dst2, dst3;
2242 __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2243 __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2244 __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2245 __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2246 __m128i src_plus10, src_plus11, src_plus12, src_plus13;
2247 __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2248 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
2249
2250 sao_offset = __lsx_vld(sao_offset_val, 0);
2251 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2252
2253 for (; height; height -= 4) {
2254 src_orig = src - 1;
2255 dst_orig = dst;
2256
2257 src_minus11 = __lsx_vld(src_orig, 0);
2258 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2259 src_plus10, src_plus11);
2260 src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
2261
2262 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2263 src_minus10 = __lsx_vld(src_orig - src_stride, 2);
2264 src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
2265 src_orig += 16;
2266 src10 = __lsx_vld(src_orig, 0);
2267 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2268 src11, src12);
2269             src13 = __lsx_vldx(src_orig, src_stride_3x);
2270
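            /* shuf1 recovers the centre pixels of the four rows; shuf2 on
             * the row above gives the (x + 1, y - 1) neighbour, while the
             * rows below, loaded one byte to the left, already provide the
             * (x - 1, y + 1) neighbour. */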
2271 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
2272 src_plus10, shuf1, src12, src_plus11, shuf1, src13,
2273 src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
2274 src_zero3);
2275 src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
2276 DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
2277 src_plus11, shuf2, src_minus12, src_minus13);
2278
2279 DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
2280 src_plus10, src_zero1, src_minus11, src_zero1,
2281 src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
2282 cmp_plus11);
2283 DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
2284 src_plus12, src_zero3, src_minus13, src_zero3,
2285 src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
2286 cmp_plus13);
2287 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
2288 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
2289 cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
2290 diff_plus11);
2291 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
2292 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
2293 cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
2294 diff_plus13);
2295 DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
2296 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
2297 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
2298 DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
2299 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
2300 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
2301 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
2302 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
2303 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
2304 cmp_plus11);
2305 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
2306 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
2307 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
2308 cmp_plus13);
2309 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2310 diff_plus10, const1, cmp_plus10, diff_minus11, const1,
2311 cmp_minus11, diff_plus11, const1, cmp_plus11,
2312 diff_minus10, diff_plus10, diff_minus11, diff_plus11);
2313 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
2314 diff_plus12, const1, cmp_plus12, diff_minus13, const1,
2315 cmp_minus13, diff_plus13, const1, cmp_plus13,
2316 diff_minus12, diff_plus12, diff_minus13, diff_plus13);
2317
2318 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
2319 diff_plus11, diff_minus12, diff_plus12, diff_minus13,
2320 diff_plus13, offset_mask0, offset_mask1, offset_mask2,
2321 offset_mask3);
2322 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
2323 offset_mask2, 2, offset_mask3, 2, offset_mask0,
2324 offset_mask1, offset_mask2, offset_mask3);
2325
2326 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
2327 sao_offset, sao_offset, offset_mask0, offset_mask0,
2328 offset_mask0);
2329 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
2330 sao_offset, sao_offset, offset_mask1, offset_mask1,
2331 offset_mask1);
2332 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
2333 sao_offset, sao_offset, offset_mask2, offset_mask2,
2334 offset_mask2);
2335 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
2336 sao_offset, sao_offset, offset_mask3, offset_mask3,
2337 offset_mask3);
2338
2339 DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
2340 src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
2341 src_zero2, src_zero3);
2342 DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
2343 offset_mask1, src_zero2, offset_mask2, src_zero3,
2344 offset_mask3, dst0, dst1, dst2, dst3);
2345 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
2346 128, dst0, dst1, dst2, dst3);
2347
2348 src_minus11 = src10;
2349 src_plus10 = src11;
2350 src_plus11 = src12;
2351 src_plus12 = src13;
2352
2353 __lsx_vst(dst0, dst_orig, 0);
2354 __lsx_vstx(dst1, dst_orig, dst_stride);
2355 __lsx_vstx(dst2, dst_orig, dst_stride_2x);
2356 __lsx_vstx(dst3, dst_orig, dst_stride_3x);
2357 dst_orig += 16;
2358 }
2359
2360 src += src_stride_4x;
2361 dst += dst_stride_4x;
2362 }
2363 }
2364
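/* 8-bit SAO edge filtering entry point (expected to be installed in
 * HEVCDSPContext.sao_edge_filter by the LoongArch DSP init).  eo selects
 * the direction: 0 horizontal, 1 vertical, 2 and 3 the two diagonals.
 * The width is split into a multiple-of-16 part, an 8-pixel part and a
 * trailing 4-pixel part, each handled by the helpers above.  src is
 * expected to point into the decoder's temporary SAO buffer, whose line
 * stride is the fixed stride_src computed below. */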
2365 void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
2366 ptrdiff_t stride_dst,
2367 int16_t *sao_offset_val,
2368 int eo, int width, int height)
2369 {
2370 ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
2371
2372 switch (eo) {
2373 case 0:
2374 if (width >> 4) {
2375 hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
2376 src, stride_src,
2377 sao_offset_val,
2378 width - (width & 0x0F),
2379 height);
2380 dst += width & 0xFFFFFFF0;
2381 src += width & 0xFFFFFFF0;
2382 width &= 0x0F;
2383 }
2384
2385 if (width >> 3) {
2386 hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
2387 src, stride_src,
2388 sao_offset_val, height);
2389 dst += 8;
2390 src += 8;
2391 width &= 0x07;
2392 }
2393
2394 if (width) {
2395 hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
2396 src, stride_src,
2397 sao_offset_val, height);
2398 }
2399 break;
2400
2401 case 1:
2402 if (width >> 4) {
2403 hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
2404 src, stride_src,
2405 sao_offset_val,
2406 width - (width & 0x0F),
2407 height);
2408 dst += width & 0xFFFFFFF0;
2409 src += width & 0xFFFFFFF0;
2410 width &= 0x0F;
2411 }
2412
2413 if (width >> 3) {
2414 hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
2415 src, stride_src,
2416 sao_offset_val, height);
2417 dst += 8;
2418 src += 8;
2419 width &= 0x07;
2420 }
2421
2422 if (width) {
2423 hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
2424 src, stride_src,
2425 sao_offset_val, height);
2426 }
2427 break;
2428
2429 case 2:
2430 if (width >> 4) {
2431 hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
2432 src, stride_src,
2433 sao_offset_val,
2434 width - (width & 0x0F),
2435 height);
2436 dst += width & 0xFFFFFFF0;
2437 src += width & 0xFFFFFFF0;
2438 width &= 0x0F;
2439 }
2440
2441 if (width >> 3) {
2442 hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
2443 src, stride_src,
2444 sao_offset_val, height);
2445 dst += 8;
2446 src += 8;
2447 width &= 0x07;
2448 }
2449
2450 if (width) {
2451 hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
2452 src, stride_src,
2453 sao_offset_val, height);
2454 }
2455 break;
2456
2457 case 3:
2458 if (width >> 4) {
2459 hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
2460 src, stride_src,
2461 sao_offset_val,
2462 width - (width & 0x0F),
2463 height);
2464 dst += width & 0xFFFFFFF0;
2465 src += width & 0xFFFFFFF0;
2466 width &= 0x0F;
2467 }
2468
2469 if (width >> 3) {
2470 hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
2471 src, stride_src,
2472 sao_offset_val, height);
2473 dst += 8;
2474 src += 8;
2475 width &= 0x07;
2476 }
2477
2478 if (width) {
2479 hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
2480 src, stride_src,
2481 sao_offset_val, height);
2482 }
2483 break;
2484 }
2485 }
2486