/*
 * Loongson LASX optimized h264dsp
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264dsp_lasx.h"

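/* Clipped p1/q1 update of the normal (bS < 4) luma filter:
 * p1' = p1 + clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc, tc)
 */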
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,      \
                         p1_or_q1_org_in, p2_or_q2_org_in,      \
                         neg_tc_in, tc_in, p1_or_q1_out)        \
{                                                               \
    __m256i clip3, temp;                                        \
                                                                \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                   \
                             q0_or_p0_org_in);                  \
    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);                 \
    clip3 = __lasx_xvsub_h(clip3, temp);                        \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);             \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);           \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);      \
}

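/* Clipped p0/q0 update of the normal (bS < 4) filter:
 * delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
 * p0' = clip255(p0 + delta), q0' = clip255(q0 - delta)
 */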
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
                     p1_or_q1_org_in, q1_or_p1_org_in,          \
                     neg_threshold_in, threshold_in,            \
                     p0_or_q0_out, q0_or_p0_out)                \
{                                                               \
    __m256i q0_sub_p0, p1_sub_q1, delta;                        \
                                                                \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,                 \
                               p0_or_q0_org_in);                \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,                 \
                               q1_or_p1_org_in);                \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);                  \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);                 \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);               \
    delta = __lasx_xvsrai_h(delta, 3);                          \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,            \
                            threshold_in);                      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);      \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);      \
                                                                \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);            \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);            \
}

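/* Luma loop filter across a vertical edge (bS < 4): a 16x8 block around
 * data[-4] is loaded row by row and transposed so that p3..q3 end up in
 * separate vectors, filtered, then transposed back and stored. */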
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_8x = img_width << 3;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4;
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;

            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}

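/* Luma loop filter across a horizontal edge (bS < 4): rows p2..q2 are loaded
 * directly above and below the edge, so no transpose is needed; filtered
 * pixels are blended with the originals via __lasx_xvbitsel_v and stored. */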
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
                  p2_org, p1_org);
        p0_org = __lasx_xvldx(data, -img_width);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            q2_org = __lasx_xvldx(data, img_width_2x);

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}

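/* Chroma loop filter across a vertical edge (bS < 4): an 8x4 block around
 * data[-2] is loaded and transposed to obtain p1/p0/q0/q1, only p0/q0 are
 * updated, and the two filtered columns are stored back row by row. */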
void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
    bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 2;
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            /* LASX_TRANSPOSE8x4_B */
            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                      row7, row5, p1_org, p0_org, q0_org, q1_org);
            row0 = __lasx_xvilvl_b(p0_org, p1_org);
            row1 = __lasx_xvilvl_b(q1_org, q0_org);
            row3 = __lasx_xvilvh_w(row1, row0);
            row2 = __lasx_xvilvl_w(row1, row0);
            p1_org = __lasx_xvpermi_d(row2, 0x00);
            p0_org = __lasx_xvpermi_d(row2, 0x55);
            q0_org = __lasx_xvpermi_d(row3, 0x00);
            q1_org = __lasx_xvpermi_d(row3, 0x55);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i tc_h, neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
            src = data - 1;
            __lasx_xvstelm_h(p0_org, src, 0, 0);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 1);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 2);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 3);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 4);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 5);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 6);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 7);
        }
    }
}

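/* Chroma loop filter across a horizontal edge (bS < 4): p1/p0/q0/q1 are
 * loaded directly and only p0/q0 are updated. */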
void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
    bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
                  p1_org, p0_org);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i neg_thresh_h, tc_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
                __lasx_xvstelm_d(q0_h, data, 0, 0);
            }
        }
    }
}

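/* Strong (bS = 4) luma filter for one side of the edge:
 * p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
 * p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 * p2' = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
 * (and symmetrically for q0/q1/q2). */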
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,        \
                                 q3_or_p3_org_in, p1_or_q1_org_in,        \
                                 p2_or_q2_org_in, q1_or_p1_org_in,        \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)\
{                                                                         \
    __m256i threshold;                                                    \
    __m256i const2, const3 = __lasx_xvldi(0);                             \
                                                                          \
    const2 = __lasx_xvaddi_hu(const3, 2);                                 \
    const3 = __lasx_xvaddi_hu(const3, 3);                                 \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);         \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);               \
                                                                          \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                         \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);         \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);         \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                 \
                                                                          \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);            \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                 \
                                                                          \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);               \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);         \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);         \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);               \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                 \
}

/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,                \
                         p1_or_q1_org_in, p0_or_q0_out)                   \
{                                                                         \
    __m256i const2 = __lasx_xvldi(0);                                     \
    const2 = __lasx_xvaddi_hu(const2, 2);                                 \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);         \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);         \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);                 \
}

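/* Intra (bS = 4) luma loop filter across a vertical edge: the 16x8 block is
 * transposed, both weak and strong filter results are computed, selected per
 * pixel by the alpha/beta conditions, then transposed back before storing. */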
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);
        src += img_width_4x;

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        /* transpose and store */
        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i control = {0x0000000400000000, 0x0000000500000001,
                               0x0000000600000002, 0x0000000700000003};

            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                      p0_org, p1_org, p2_org, p3_org);
            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                      row0, row2);
            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                      row1, row3);
            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                      control, row7, control, row4, row5, row6, row7);
            src = data - 4;
            __lasx_xvstelm_d(row4, src, 0, 0);
            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row4, src, 0, 2);
            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 0);
            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 2);
            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 0);
            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 2);
            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 0);
            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 2);
            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
        }
    }
}

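/* Intra (bS = 4) luma loop filter across a horizontal edge: rows are loaded
 * directly above and below the edge, so results can be written back with
 * full-vector stores. */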
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        __lasx_xvst(q0_org, data, 0);
    }
}

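/* Intra (bS = 4) chroma loop filter across a vertical edge: only p0/q0 are
 * filtered, using (2 * p1 + p0 + q1 + 2) >> 2. */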
void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    uint8_t *src = data - 2;
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row4, row5, row6, row7);

        /* LASX_TRANSPOSE8x4_B */
        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                  row7, row5, p1_org, p0_org, q0_org, q1_org);
        row0 = __lasx_xvilvl_b(p0_org, p1_org);
        row1 = __lasx_xvilvl_b(q1_org, q0_org);
        row3 = __lasx_xvilvh_w(row1, row0);
        row2 = __lasx_xvilvl_w(row1, row0);
        p1_org = __lasx_xvpermi_d(row2, 0x00);
        p0_org = __lasx_xvpermi_d(row2, 0x55);
        q0_org = __lasx_xvpermi_d(row3, 0x00);
        q1_org = __lasx_xvpermi_d(row3, 0x55);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
    }
    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
    src = data - 1;
    __lasx_xvstelm_h(p0_org, src, 0, 0);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 1);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 2);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 3);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 4);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 5);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 6);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 7);
}

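/* Intra (bS = 4) chroma loop filter across a horizontal edge. */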
void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);

    p1_org = __lasx_xvldx(data, -img_width_2x);
    p0_org = __lasx_xvldx(data, -img_width);
    DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
        __lasx_xvstelm_d(q0_h, data, 0, 0);
    }
}

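/* Bi-directional weighted prediction for 16-pixel-wide blocks (8 or 16 rows):
 * dst = clip255((src * weight_src + dst * weight_dst + round) >> (log2_denom + 1))
 * where round folds together the rounding constant, the caller's offset and
 * the compensation for the +-128 bias applied with xvxori_b, so the signed
 * dot-product __lasx_xvdp2add_h_b can apply both weights in one pass. */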
void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
                                      ptrdiff_t stride, int height,
                                      int log2_denom, int weight_dst,
                                      int weight_src, int offset_in)
{
    __m256i wgt;
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3;
    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i denom, offset;
    int stride_2x = stride << 1;
    int stride_4x = stride << 2;
    int stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src += stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);

    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);

    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst0, dst, 0, 2);
    __lasx_xvstelm_d(dst0, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 2);
    __lasx_xvstelm_d(dst1, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 2);
    __lasx_xvstelm_d(dst2, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 2);
    __lasx_xvstelm_d(dst3, dst, 8, 3);
    dst += stride;

    if (16 == height) {
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
        src += stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
        dst += stride_4x;
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
        dst -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

        DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
                  dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec0, vec2, vec4, vec6);
        DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec1, vec3, vec5, vec7);

        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
                  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
                  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

        tmp0 = __lasx_xvsra_h(tmp0, denom);
        tmp1 = __lasx_xvsra_h(tmp1, denom);
        tmp2 = __lasx_xvsra_h(tmp2, denom);
        tmp3 = __lasx_xvsra_h(tmp3, denom);
        tmp4 = __lasx_xvsra_h(tmp4, denom);
        tmp5 = __lasx_xvsra_h(tmp5, denom);
        tmp6 = __lasx_xvsra_h(tmp6, denom);
        tmp7 = __lasx_xvsra_h(tmp7, denom);

        DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                  tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
                  tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
                  tmp6, dst0, dst1, dst2, dst3);
        __lasx_xvstelm_d(dst0, dst, 0, 0);
        __lasx_xvstelm_d(dst0, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst0, dst, 0, 2);
        __lasx_xvstelm_d(dst0, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 0);
        __lasx_xvstelm_d(dst1, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 2);
        __lasx_xvstelm_d(dst1, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 0);
        __lasx_xvstelm_d(dst2, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 2);
        __lasx_xvstelm_d(dst2, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 0);
        __lasx_xvstelm_d(dst3, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 2);
        __lasx_xvstelm_d(dst3, dst, 8, 3);
    }
}

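/* Bi-weight helpers for 8-pixel-wide blocks: each group of four 8-byte rows
 * is packed into one 256-bit vector so a single dot-product/shift/clip pass
 * handles four rows at a time. */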
static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
}

static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3;
    __m256i src0, src1, dst0, dst1;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *dst_tmp = dst;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    tmp0 = __lasx_xvld(dst_tmp, 0);
    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
              src0, src1, dst0, dst1);
    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
}

avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)1282 static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1283 int32_t log2_denom, int32_t weight_src,
1284 int32_t weight_dst, int32_t offset_in)
1285 {
1286 __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1287 __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1288 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1289 ptrdiff_t stride_2x = stride << 1;
1290 ptrdiff_t stride_4x = stride << 2;
1291 ptrdiff_t stride_3x = stride_2x + stride;
1292 uint8_t* dst_tmp = dst;
1293
1294 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1295 offset_in += ((weight_src + weight_dst) << 7);
1296 log2_denom += 1;
1297
1298 tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1299 tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1300 wgt = __lasx_xvilvh_b(tmp1, tmp0);
1301 offset = __lasx_xvreplgr2vr_h(offset_in);
1302 denom = __lasx_xvreplgr2vr_h(log2_denom);
1303
1304 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1305 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1306 src += stride_4x;
1307 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1308 src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1309 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1310 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1311 src += stride_4x;
1312 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1313 src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1314 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1315 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1316 src += stride_4x;
1317 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1318 src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1319 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1320 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1321 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1322 src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1323
1324 DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1325 dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1326 dst_tmp += stride_4x;
1327 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1328 dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1329 DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1330 dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1331 dst_tmp += stride_4x;
1332 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1333 dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1334 DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1335 dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1336 dst_tmp += stride_4x;
1337 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1338 dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1339 DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1340 dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1341 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1342 dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1343
1344 DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1345 src0, src1, src2, src3);
1346 DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1347 dst0, dst1, dst2, dst3);
1348 DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1349 dst3, src3, vec0, vec2, vec4, vec6);
1350 DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1351 dst3, src3, vec1, vec3, vec5, vec7);
1352 DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1353 offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1354 DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5,
1355 offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1356 tmp0 = __lasx_xvsra_h(tmp0, denom);
1357 tmp1 = __lasx_xvsra_h(tmp1, denom);
1358 tmp2 = __lasx_xvsra_h(tmp2, denom);
1359 tmp3 = __lasx_xvsra_h(tmp3, denom);
1360 tmp4 = __lasx_xvsra_h(tmp4, denom);
1361 tmp5 = __lasx_xvsra_h(tmp5, denom);
1362 tmp6 = __lasx_xvsra_h(tmp6, denom);
1363 tmp7 = __lasx_xvsra_h(tmp7, denom);
1364 DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1365 tmp0, tmp1, tmp2, tmp3);
1366 DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1367 tmp4, tmp5, tmp6, tmp7);
1368 DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1369 dst0, dst1, dst2, dst3)
1370 __lasx_xvstelm_d(dst0, dst, 0, 0);
1371 __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1372 __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1373 __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1374 dst += stride_4x;
1375 __lasx_xvstelm_d(dst1, dst, 0, 0);
1376 __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1377 __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1378 __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1379 dst += stride_4x;
1380 __lasx_xvstelm_d(dst2, dst, 0, 0);
1381 __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
1382 __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
1383 __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
1384 dst += stride_4x;
1385 __lasx_xvstelm_d(dst3, dst, 0, 0);
1386 __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
1387 __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
1388 __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
1389 }
1390
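/* Bi-directional weighted prediction for 8-pixel-wide blocks, dispatched by
 * block height to the fixed-size helpers above.  A rough scalar equivalent
 * (an illustrative sketch, not taken from this file) is:
 *
 *     offset = ((offset + 1) | 1) << log2_denom;
 *     for (y = 0; y < height; y++)
 *         for (x = 0; x < 8; x++)
 *             dst[x] = av_clip_uint8((src[x] * weight_src +
 *                                     dst[x] * weight_dst + offset)
 *                                    >> (log2_denom + 1));
 */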
1391 void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
1392 ptrdiff_t stride, int height,
1393 int log2_denom, int weight_dst,
1394 int weight_src, int offset)
1395 {
1396 if (4 == height) {
1397 avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
1398 offset);
1399 } else if (8 == height) {
1400 avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
1401 offset);
1402 } else {
1403 avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
1404 offset);
1405 }
1406 }
1407
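/* 4xN bi-weight helpers.  Pixels are XOR'ed with 128 so that the signed byte
 * dot-product __lasx_xvdp2add_h_b can form src * weight_src +
 * dst * weight_dst in one step; the 128 * (weight_src + weight_dst) bias this
 * removes is added back into the offset below
 * ((weight_src + weight_dst) << 7), after which the arithmetic shift by
 * log2_denom + 1 and the clip to [0, 255] complete the reference formula. */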
1408 static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1409 int32_t log2_denom, int32_t weight_src,
1410 int32_t weight_dst, int32_t offset_in)
1411 {
1412 __m256i wgt, vec0;
1413 __m256i src0, dst0;
1414 __m256i tmp0, tmp1, denom, offset;
1415
1416 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1417 offset_in += ((weight_src + weight_dst) << 7);
1418 log2_denom += 1;
1419
1420 tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1421 tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1422 wgt = __lasx_xvilvh_b(tmp1, tmp0);
1423 offset = __lasx_xvreplgr2vr_h(offset_in);
1424 denom = __lasx_xvreplgr2vr_h(log2_denom);
1425
1426 DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1427 src0 = __lasx_xvilvl_w(tmp1, tmp0);
1428 DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
1429 dst0 = __lasx_xvilvl_w(tmp1, tmp0);
1430 DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1431 vec0 = __lasx_xvilvl_b(dst0, src0);
1432 tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1433 tmp0 = __lasx_xvsra_h(tmp0, denom);
1434 tmp0 = __lasx_xvclip255_h(tmp0);
1435 tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
1436 __lasx_xvstelm_w(tmp0, dst, 0, 0);
1437 __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1438 }
1439
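/* 4x4: rows 0-1 land in the low 128-bit lane and rows 2-3 in the high lane,
 * so the packed result is stored from word elements 0, 1, 4 and 5. */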
1440 static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1441 int32_t log2_denom, int32_t weight_src,
1442 int32_t weight_dst, int32_t offset_in)
1443 {
1444 __m256i wgt, vec0;
1445 __m256i src0, dst0;
1446 __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1447 ptrdiff_t stride_2x = stride << 1;
1448 ptrdiff_t stride_3x = stride_2x + stride;
1449
1450 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1451 offset_in += ((weight_src + weight_dst) << 7);
1452 log2_denom += 1;
1453
1454 tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1455 tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1456 wgt = __lasx_xvilvh_b(tmp1, tmp0);
1457 offset = __lasx_xvreplgr2vr_h(offset_in);
1458 denom = __lasx_xvreplgr2vr_h(log2_denom);
1459
1460 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1461 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1462 DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1463 src0 = __lasx_xvilvl_w(tmp1, tmp0);
1464 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1465 dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1466 DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1467 dst0 = __lasx_xvilvl_w(tmp1, tmp0);
1468 DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1469 vec0 = __lasx_xvilvl_b(dst0, src0);
1470 dst0 = __lasx_xvilvh_b(dst0, src0);
1471 vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
1472 tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1473 tmp0 = __lasx_xvsra_h(tmp0, denom);
1474 tmp0 = __lasx_xvclip255_h(tmp0);
1475 tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
1476 __lasx_xvstelm_w(tmp0, dst, 0, 0);
1477 __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1478 __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
1479 __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
1480 }
1481
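/* 4x8: eight 4-pixel rows of src and dst are gathered into one 256-bit
 * vector each, giving two interleaved src/dst vectors and two dot-product
 * accumulations before a single pack and eight element stores. */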
1482 static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1483 int32_t log2_denom, int32_t weight_src,
1484 int32_t weight_dst, int32_t offset_in)
1485 {
1486 __m256i wgt, vec0, vec1;
1487 __m256i src0, dst0;
1488 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1489 ptrdiff_t stride_2x = stride << 1;
1490 ptrdiff_t stride_4x = stride << 2;
1491 ptrdiff_t stride_3x = stride_2x + stride;
1492
1493 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1494 offset_in += ((weight_src + weight_dst) << 7);
1495 log2_denom += 1;
1496
1497 tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1498 tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1499 wgt = __lasx_xvilvh_b(tmp1, tmp0);
1500 offset = __lasx_xvreplgr2vr_h(offset_in);
1501 denom = __lasx_xvreplgr2vr_h(log2_denom);
1502
1503 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1504 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1505 src += stride_4x;
1506 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1507 src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1508 DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1509 tmp0, tmp1, tmp2, tmp3);
1510 DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1511 src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1512 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1513 dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1514 dst += stride_4x;
1515 DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1516 dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1517 dst -= stride_4x;
1518 DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1519 tmp0, tmp1, tmp2, tmp3);
1520 DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1521 dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1522 DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1523 vec0 = __lasx_xvilvl_b(dst0, src0);
1524 vec1 = __lasx_xvilvh_b(dst0, src0);
1525 DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1526 tmp0, tmp1);
1527 tmp0 = __lasx_xvsra_h(tmp0, denom);
1528 tmp1 = __lasx_xvsra_h(tmp1, denom);
1529 DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1530 tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
1531 __lasx_xvstelm_w(tmp0, dst, 0, 0);
1532 __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1533 __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
1534 __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
1535 dst += stride_4x;
1536 __lasx_xvstelm_w(tmp0, dst, 0, 4);
1537 __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
1538 __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
1539 __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
1540 }
1541
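/* Dispatch on block height (2, 4 or 8 rows) to the 4xN bi-weight helpers. */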
1542 void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
1543 ptrdiff_t stride, int height,
1544 int log2_denom, int weight_dst,
1545 int weight_src, int offset)
1546 {
1547 if (2 == height) {
1548 avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
1549 weight_dst, offset);
1550 } else if (4 == height) {
1551 avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
1552 weight_dst, offset);
1553 } else {
1554 avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
1555 weight_dst, offset);
1556 }
1557 }
1558
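/* Uni-directional weighted prediction, 16 pixels wide.  Roughly the scalar
 * form (illustrative, not from this file):
 *
 *     block[x] = av_clip_uint8((block[x] * weight_src
 *                               + (offset << log2_denom) + round)
 *                              >> log2_denom);
 *
 * Pixels are widened to 16 bits, multiplied, saturating-added to the
 * pre-shifted offset and clamped at zero; __lasx_xvssrlrn_bu_h then performs
 * the rounding shift and the saturation to [0, 255] in one instruction. */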
1559 void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
1560 int height, int log2_denom,
1561 int weight_src, int offset_in)
1562 {
1563 uint32_t offset_val;
1564 ptrdiff_t stride_2x = stride << 1;
1565 ptrdiff_t stride_4x = stride << 2;
1566 ptrdiff_t stride_3x = stride_2x + stride;
1567 __m256i zero = __lasx_xvldi(0);
1568 __m256i src0, src1, src2, src3;
1569 __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
1570 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1571 __m256i wgt, denom, offset;
1572
1573 offset_val = (unsigned) offset_in << log2_denom;
1574
1575 wgt = __lasx_xvreplgr2vr_h(weight_src);
1576 offset = __lasx_xvreplgr2vr_h(offset_val);
1577 denom = __lasx_xvreplgr2vr_h(log2_denom);
1578
1579 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1580 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1581 src += stride_4x;
1582 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1583 src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1584 src -= stride_4x;
1585 DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1586 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1587 DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1588 zero, src3, src0_l, src1_l, src2_l, src3_l);
1589 DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1590 zero, src3, src0_h, src1_h, src2_h, src3_h);
1591 src0_l = __lasx_xvmul_h(wgt, src0_l);
1592 src0_h = __lasx_xvmul_h(wgt, src0_h);
1593 src1_l = __lasx_xvmul_h(wgt, src1_l);
1594 src1_h = __lasx_xvmul_h(wgt, src1_h);
1595 src2_l = __lasx_xvmul_h(wgt, src2_l);
1596 src2_h = __lasx_xvmul_h(wgt, src2_h);
1597 src3_l = __lasx_xvmul_h(wgt, src3_l);
1598 src3_h = __lasx_xvmul_h(wgt, src3_h);
1599 DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1600 src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1601 DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1602 src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1603 src0_l = __lasx_xvmaxi_h(src0_l, 0);
1604 src0_h = __lasx_xvmaxi_h(src0_h, 0);
1605 src1_l = __lasx_xvmaxi_h(src1_l, 0);
1606 src1_h = __lasx_xvmaxi_h(src1_h, 0);
1607 src2_l = __lasx_xvmaxi_h(src2_l, 0);
1608 src2_h = __lasx_xvmaxi_h(src2_h, 0);
1609 src3_l = __lasx_xvmaxi_h(src3_l, 0);
1610 src3_h = __lasx_xvmaxi_h(src3_h, 0);
1611 src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1612 src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1613 src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1614 src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1615 src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1616 src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1617 src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1618 src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1619 __lasx_xvstelm_d(src0_l, src, 0, 0);
1620 __lasx_xvstelm_d(src0_h, src, 8, 0);
1621 src += stride;
1622 __lasx_xvstelm_d(src0_l, src, 0, 2);
1623 __lasx_xvstelm_d(src0_h, src, 8, 2);
1624 src += stride;
1625 __lasx_xvstelm_d(src1_l, src, 0, 0);
1626 __lasx_xvstelm_d(src1_h, src, 8, 0);
1627 src += stride;
1628 __lasx_xvstelm_d(src1_l, src, 0, 2);
1629 __lasx_xvstelm_d(src1_h, src, 8, 2);
1630 src += stride;
1631 __lasx_xvstelm_d(src2_l, src, 0, 0);
1632 __lasx_xvstelm_d(src2_h, src, 8, 0);
1633 src += stride;
1634 __lasx_xvstelm_d(src2_l, src, 0, 2);
1635 __lasx_xvstelm_d(src2_h, src, 8, 2);
1636 src += stride;
1637 __lasx_xvstelm_d(src3_l, src, 0, 0);
1638 __lasx_xvstelm_d(src3_h, src, 8, 0);
1639 src += stride;
1640 __lasx_xvstelm_d(src3_l, src, 0, 2);
1641 __lasx_xvstelm_d(src3_h, src, 8, 2);
1642 src += stride;
1643
1644 if (16 == height) {
1645 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1646 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1647 src += stride_4x;
1648 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1649 src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1650 src -= stride_4x;
1651 DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1652 tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1653 DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1654 zero, src3, src0_l, src1_l, src2_l, src3_l);
1655 DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1656 zero, src3, src0_h, src1_h, src2_h, src3_h);
1657 src0_l = __lasx_xvmul_h(wgt, src0_l);
1658 src0_h = __lasx_xvmul_h(wgt, src0_h);
1659 src1_l = __lasx_xvmul_h(wgt, src1_l);
1660 src1_h = __lasx_xvmul_h(wgt, src1_h);
1661 src2_l = __lasx_xvmul_h(wgt, src2_l);
1662 src2_h = __lasx_xvmul_h(wgt, src2_h);
1663 src3_l = __lasx_xvmul_h(wgt, src3_l);
1664 src3_h = __lasx_xvmul_h(wgt, src3_h);
1665 DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
1666 offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1667 DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
1668 offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1669 src0_l = __lasx_xvmaxi_h(src0_l, 0);
1670 src0_h = __lasx_xvmaxi_h(src0_h, 0);
1671 src1_l = __lasx_xvmaxi_h(src1_l, 0);
1672 src1_h = __lasx_xvmaxi_h(src1_h, 0);
1673 src2_l = __lasx_xvmaxi_h(src2_l, 0);
1674 src2_h = __lasx_xvmaxi_h(src2_h, 0);
1675 src3_l = __lasx_xvmaxi_h(src3_l, 0);
1676 src3_h = __lasx_xvmaxi_h(src3_h, 0);
1677 src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1678 src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1679 src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1680 src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1681 src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1682 src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1683 src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1684 src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1685 __lasx_xvstelm_d(src0_l, src, 0, 0);
1686 __lasx_xvstelm_d(src0_h, src, 8, 0);
1687 src += stride;
1688 __lasx_xvstelm_d(src0_l, src, 0, 2);
1689 __lasx_xvstelm_d(src0_h, src, 8, 2);
1690 src += stride;
1691 __lasx_xvstelm_d(src1_l, src, 0, 0);
1692 __lasx_xvstelm_d(src1_h, src, 8, 0);
1693 src += stride;
1694 __lasx_xvstelm_d(src1_l, src, 0, 2);
1695 __lasx_xvstelm_d(src1_h, src, 8, 2);
1696 src += stride;
1697 __lasx_xvstelm_d(src2_l, src, 0, 0);
1698 __lasx_xvstelm_d(src2_h, src, 8, 0);
1699 src += stride;
1700 __lasx_xvstelm_d(src2_l, src, 0, 2);
1701 __lasx_xvstelm_d(src2_h, src, 8, 2);
1702 src += stride;
1703 __lasx_xvstelm_d(src3_l, src, 0, 0);
1704 __lasx_xvstelm_d(src3_h, src, 8, 0);
1705 src += stride;
1706 __lasx_xvstelm_d(src3_l, src, 0, 2);
1707 __lasx_xvstelm_d(src3_h, src, 8, 2);
1708 }
1709 }
1710
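/* 8xN weight helpers: the same widen/multiply/offset/round sequence as the
 * 16-wide path, with the 8-byte rows packed four per 256-bit vector. */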
1711 static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
1712 int32_t log2_denom, int32_t weight_src,
1713 int32_t offset_in)
1714 {
1715 uint32_t offset_val;
1716 ptrdiff_t stride_2x = stride << 1;
1717 ptrdiff_t stride_3x = stride_2x + stride;
1718 __m256i wgt, zero = __lasx_xvldi(0);
1719 __m256i src0, src0_h, src0_l;
1720 __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1721
1722 offset_val = (unsigned) offset_in << log2_denom;
1723
1724 wgt = __lasx_xvreplgr2vr_h(weight_src);
1725 offset = __lasx_xvreplgr2vr_h(offset_val);
1726 denom = __lasx_xvreplgr2vr_h(log2_denom);
1727
1728 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1729 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1730 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1731 src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1732 src0_l = __lasx_xvilvl_b(zero, src0);
1733 src0_h = __lasx_xvilvh_b(zero, src0);
1734 src0_l = __lasx_xvmul_h(wgt, src0_l);
1735 src0_h = __lasx_xvmul_h(wgt, src0_h);
1736 src0_l = __lasx_xvsadd_h(src0_l, offset);
1737 src0_h = __lasx_xvsadd_h(src0_h, offset);
1738 src0_l = __lasx_xvmaxi_h(src0_l, 0);
1739 src0_h = __lasx_xvmaxi_h(src0_h, 0);
1740 src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1741 src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1742
1743 src0 = __lasx_xvpickev_d(src0_h, src0_l);
1744 __lasx_xvstelm_d(src0, src, 0, 0);
1745 __lasx_xvstelm_d(src0, src + stride, 0, 1);
1746 __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1747 __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1748 }
1749
1750 static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
1751 int32_t src_weight, int32_t offset_in)
1752 {
1753 __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
1754 __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1755 uint32_t offset_val;
1756 uint8_t* src_tmp = src;
1757 ptrdiff_t stride_2x = stride << 1;
1758 ptrdiff_t stride_4x = stride << 2;
1759 ptrdiff_t stride_3x = stride_2x + stride;
1760
1761 offset_val = (unsigned) offset_in << log2_denom;
1762
1763 wgt = __lasx_xvreplgr2vr_h(src_weight);
1764 offset = __lasx_xvreplgr2vr_h(offset_val);
1765 denom = __lasx_xvreplgr2vr_h(log2_denom);
1766
1767 DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1768 src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1769 src_tmp += stride_4x;
1770 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1771 src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1772 DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1773 src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1774 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1775 src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1776 DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
1777 DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
1778 src0_l = __lasx_xvmul_h(wgt, src0_l);
1779 src0_h = __lasx_xvmul_h(wgt, src0_h);
1780 src1_l = __lasx_xvmul_h(wgt, src1_l);
1781 src1_h = __lasx_xvmul_h(wgt, src1_h);
1782 DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1783 src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1784 src0_l = __lasx_xvmaxi_h(src0_l, 0);
1785 src0_h = __lasx_xvmaxi_h(src0_h, 0);
1786 src1_l = __lasx_xvmaxi_h(src1_l, 0);
1787 src1_h = __lasx_xvmaxi_h(src1_h, 0);
1788 src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1789 src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1790 src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1791 src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1792
1793 DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
1794 __lasx_xvstelm_d(src0, src, 0, 0);
1795 __lasx_xvstelm_d(src0, src + stride, 0, 1);
1796 __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1797 __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1798 src += stride_4x;
1799 __lasx_xvstelm_d(src1, src, 0, 0);
1800 __lasx_xvstelm_d(src1, src + stride, 0, 1);
1801 __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1802 __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1803 }
1804
1805 static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
1806 int32_t log2_denom, int32_t src_weight,
1807 int32_t offset_in)
1808 {
1809 __m256i src0, src1, src2, src3;
1810 __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
1811 __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1812 __m256i zero = __lasx_xvldi(0);
1813 uint32_t offset_val;
1814 uint8_t* src_tmp = src;
1815 ptrdiff_t stride_2x = stride << 1;
1816 ptrdiff_t stride_4x = stride << 2;
1817 ptrdiff_t stride_3x = stride_2x + stride;
1818
1819 offset_val = (unsigned) offset_in << log2_denom;
1820
1821 wgt = __lasx_xvreplgr2vr_h(src_weight);
1822 offset = __lasx_xvreplgr2vr_h(offset_val);
1823 denom = __lasx_xvreplgr2vr_h(log2_denom);
1824
1825 DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1826 src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1827 src_tmp += stride_4x;
1828 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1829 src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1830 DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1831 src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1832 src_tmp += stride_4x;
1833 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1834 src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1835 DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1836 src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1837 src_tmp += stride_4x;
1838 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1839 src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1840 DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1841 src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1842 DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1843 src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1844
1845 DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
1846 src0_l, src1_l, src2_l, src3_l);
1847 DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
1848 src0_h, src1_h, src2_h, src3_h);
1849 src0_l = __lasx_xvmul_h(wgt, src0_l);
1850 src0_h = __lasx_xvmul_h(wgt, src0_h);
1851 src1_l = __lasx_xvmul_h(wgt, src1_l);
1852 src1_h = __lasx_xvmul_h(wgt, src1_h);
1853 src2_l = __lasx_xvmul_h(wgt, src2_l);
1854 src2_h = __lasx_xvmul_h(wgt, src2_h);
1855 src3_l = __lasx_xvmul_h(wgt, src3_l);
1856 src3_h = __lasx_xvmul_h(wgt, src3_h);
1857
1858 DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1859 src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1860 DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1861 src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1862
1863 src0_l = __lasx_xvmaxi_h(src0_l, 0);
1864 src0_h = __lasx_xvmaxi_h(src0_h, 0);
1865 src1_l = __lasx_xvmaxi_h(src1_l, 0);
1866 src1_h = __lasx_xvmaxi_h(src1_h, 0);
1867 src2_l = __lasx_xvmaxi_h(src2_l, 0);
1868 src2_h = __lasx_xvmaxi_h(src2_h, 0);
1869 src3_l = __lasx_xvmaxi_h(src3_l, 0);
1870 src3_h = __lasx_xvmaxi_h(src3_h, 0);
1871 src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1872 src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1873 src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1874 src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1875 src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1876 src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1877 src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1878 src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1879 DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
1880 src3_h, src3_l, src0, src1, src2, src3);
1881
1882 __lasx_xvstelm_d(src0, src, 0, 0);
1883 __lasx_xvstelm_d(src0, src + stride, 0, 1);
1884 __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1885 __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1886 src += stride_4x;
1887 __lasx_xvstelm_d(src1, src, 0, 0);
1888 __lasx_xvstelm_d(src1, src + stride, 0, 1);
1889 __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1890 __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1891 src += stride_4x;
1892 __lasx_xvstelm_d(src2, src, 0, 0);
1893 __lasx_xvstelm_d(src2, src + stride, 0, 1);
1894 __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
1895 __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
1896 src += stride_4x;
1897 __lasx_xvstelm_d(src3, src, 0, 0);
1898 __lasx_xvstelm_d(src3, src + stride, 0, 1);
1899 __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
1900 __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
1901 }
1902
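/* Dispatch on block height (4, 8 or 16 rows) to the 8xN weight helpers. */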
1903 void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
1904 int height, int log2_denom,
1905 int weight_src, int offset)
1906 {
1907 if (4 == height) {
1908 avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
1909 } else if (8 == height) {
1910 avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
1911 } else {
1912 avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
1913 }
1914 }
1915
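/* 4x2: the two 4-pixel rows share one register; a zero interleave widens
 * them to halfwords before the weight/offset/round sequence. */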
1916 static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
1917 int32_t log2_denom, int32_t weight_src,
1918 int32_t offset_in)
1919 {
1920 uint32_t offset_val;
1921 __m256i wgt, zero = __lasx_xvldi(0);
1922 __m256i src0, tmp0, tmp1, denom, offset;
1923
1924 offset_val = (unsigned) offset_in << log2_denom;
1925
1926 wgt = __lasx_xvreplgr2vr_h(weight_src);
1927 offset = __lasx_xvreplgr2vr_h(offset_val);
1928 denom = __lasx_xvreplgr2vr_h(log2_denom);
1929
1930 DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1931 src0 = __lasx_xvilvl_w(tmp1, tmp0);
1932 src0 = __lasx_xvilvl_b(zero, src0);
1933 src0 = __lasx_xvmul_h(wgt, src0);
1934 src0 = __lasx_xvsadd_h(src0, offset);
1935 src0 = __lasx_xvmaxi_h(src0, 0);
1936 src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1937 __lasx_xvstelm_w(src0, src, 0, 0);
1938 __lasx_xvstelm_w(src0, src + stride, 0, 1);
1939 }
1940
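/* 4x4: __lasx_vext2xv_hu_bu widens the 16 gathered pixels across both lanes
 * (rows 0-1 in the low lane, rows 2-3 in the high lane), hence the stores
 * from word elements 0, 1, 4 and 5. */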
1941 static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
1942 int32_t log2_denom, int32_t weight_src,
1943 int32_t offset_in)
1944 {
1945 __m256i wgt;
1946 __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
1947 uint32_t offset_val;
1948 ptrdiff_t stride_2x = stride << 1;
1949 ptrdiff_t stride_3x = stride_2x + stride;
1950
1951 offset_val = (unsigned) offset_in << log2_denom;
1952
1953 wgt = __lasx_xvreplgr2vr_h(weight_src);
1954 offset = __lasx_xvreplgr2vr_h(offset_val);
1955 denom = __lasx_xvreplgr2vr_h(log2_denom);
1956
1957 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1958 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1959 DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1960 src0 = __lasx_xvilvl_w(tmp1, tmp0);
1961 src0 = __lasx_vext2xv_hu_bu(src0);
1962 src0 = __lasx_xvmul_h(wgt, src0);
1963 src0 = __lasx_xvsadd_h(src0, offset);
1964 src0 = __lasx_xvmaxi_h(src0, 0);
1965 src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1966 __lasx_xvstelm_w(src0, src, 0, 0);
1967 __lasx_xvstelm_w(src0, src + stride, 0, 1);
1968 __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
1969 __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
1970 }
1971
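/* 4x8: rows 0-3 sit in the low 128-bit lane and rows 4-7 in the high lane,
 * so the low/high zero-interleaves hold rows {0,1,4,5} and {2,3,6,7}; the
 * two groups of element stores put them back in raster order. */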
1972 static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
1973 int32_t log2_denom, int32_t weight_src,
1974 int32_t offset_in)
1975 {
1976 __m256i src0, src0_h, src0_l;
1977 __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1978 __m256i wgt, zero = __lasx_xvldi(0);
1979 uint32_t offset_val;
1980 ptrdiff_t stride_2x = stride << 1;
1981 ptrdiff_t stride_4x = stride << 2;
1982 ptrdiff_t stride_3x = stride_2x + stride;
1983
1984 offset_val = (unsigned) offset_in << log2_denom;
1985
1986 wgt = __lasx_xvreplgr2vr_h(weight_src);
1987 offset = __lasx_xvreplgr2vr_h(offset_val);
1988 denom = __lasx_xvreplgr2vr_h(log2_denom);
1989
1990 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1991 src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1992 src += stride_4x;
1993 DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1994 src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1995 src -= stride_4x;
1996 DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
1997 tmp5, tmp0, tmp1, tmp2, tmp3);
1998 DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1999 src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
2000 src0_l = __lasx_xvilvl_b(zero, src0);
2001 src0_h = __lasx_xvilvh_b(zero, src0);
2002 src0_l = __lasx_xvmul_h(wgt, src0_l);
2003 src0_h = __lasx_xvmul_h(wgt, src0_h);
2004 src0_l = __lasx_xvsadd_h(src0_l, offset);
2005 src0_h = __lasx_xvsadd_h(src0_h, offset);
2006 src0_l = __lasx_xvmaxi_h(src0_l, 0);
2007 src0_h = __lasx_xvmaxi_h(src0_h, 0);
2008 src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
2009 src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
2010 __lasx_xvstelm_w(src0_l, src, 0, 0);
2011 __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
2012 __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
2013 __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
2014 src += stride_4x;
2015 __lasx_xvstelm_w(src0_l, src, 0, 4);
2016 __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
2017 __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
2018 __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
2019 }
2020
2021 void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
2022 int height, int log2_denom,
2023 int weight_src, int offset)
2024 {
2025 if (2 == height) {
2026 avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
2027 } else if (4 == height) {
2028 avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
2029 } else {
2030 avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
2031 }
2032 }
2033
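/* Add a 4x4 IDCT residual block to the destination and clear the residual
 * buffer, roughly (illustrative sketch):
 *
 *     for (i = 0; i < 4; i++)
 *         for (j = 0; j < 4; j++)
 *             dst[i * stride + j] += src[i * 4 + j];
 *     memset(src, 0, 16 * sizeof(int16_t));
 *
 * No clipping is applied; the sums are simply narrowed back to bytes with
 * __lasx_xvpickev_b. */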
2034 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2035 {
2036 __m256i src0, dst0, dst1, dst2, dst3, zero;
2037 __m256i tmp0, tmp1;
2038 uint8_t* _dst1 = _dst + stride;
2039 uint8_t* _dst2 = _dst1 + stride;
2040 uint8_t* _dst3 = _dst2 + stride;
2041
2042 src0 = __lasx_xvld(_src, 0);
2043 dst0 = __lasx_xvldrepl_w(_dst, 0);
2044 dst1 = __lasx_xvldrepl_w(_dst1, 0);
2045 dst2 = __lasx_xvldrepl_w(_dst2, 0);
2046 dst3 = __lasx_xvldrepl_w(_dst3, 0);
2047 tmp0 = __lasx_xvilvl_w(dst1, dst0);
2048 tmp1 = __lasx_xvilvl_w(dst3, dst2);
2049 dst0 = __lasx_xvilvl_d(tmp1, tmp0);
2050 tmp0 = __lasx_vext2xv_hu_bu(dst0);
2051 zero = __lasx_xvldi(0);
2052 tmp1 = __lasx_xvadd_h(src0, tmp0);
2053 dst0 = __lasx_xvpickev_b(tmp1, tmp1);
2054 __lasx_xvstelm_w(dst0, _dst, 0, 0);
2055 __lasx_xvstelm_w(dst0, _dst1, 0, 1);
2056 __lasx_xvstelm_w(dst0, _dst2, 0, 4);
2057 __lasx_xvstelm_w(dst0, _dst3, 0, 5);
2058 __lasx_xvst(zero, _src, 0);
2059 }
2060
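/* 8x8 variant of the above: 64 residual coefficients are added and the
 * buffer is cleared with four 32-byte stores.  The interleaved lane layout
 * after __lasx_vext2xv_hu_bu and __lasx_xvpickev_b is why the double-word
 * stores use element order 0, 2, 1, 3. */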
2061 void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2062 {
2063 __m256i src0, src1, src2, src3;
2064 __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2065 __m256i tmp0, tmp1, tmp2, tmp3;
2066 __m256i zero = __lasx_xvldi(0);
2067 uint8_t *_dst1 = _dst + stride;
2068 uint8_t *_dst2 = _dst1 + stride;
2069 uint8_t *_dst3 = _dst2 + stride;
2070 uint8_t *_dst4 = _dst3 + stride;
2071 uint8_t *_dst5 = _dst4 + stride;
2072 uint8_t *_dst6 = _dst5 + stride;
2073 uint8_t *_dst7 = _dst6 + stride;
2074
2075 src0 = __lasx_xvld(_src, 0);
2076 src1 = __lasx_xvld(_src, 32);
2077 src2 = __lasx_xvld(_src, 64);
2078 src3 = __lasx_xvld(_src, 96);
2079 dst0 = __lasx_xvldrepl_d(_dst, 0);
2080 dst1 = __lasx_xvldrepl_d(_dst1, 0);
2081 dst2 = __lasx_xvldrepl_d(_dst2, 0);
2082 dst3 = __lasx_xvldrepl_d(_dst3, 0);
2083 dst4 = __lasx_xvldrepl_d(_dst4, 0);
2084 dst5 = __lasx_xvldrepl_d(_dst5, 0);
2085 dst6 = __lasx_xvldrepl_d(_dst6, 0);
2086 dst7 = __lasx_xvldrepl_d(_dst7, 0);
2087 tmp0 = __lasx_xvilvl_d(dst1, dst0);
2088 tmp1 = __lasx_xvilvl_d(dst3, dst2);
2089 tmp2 = __lasx_xvilvl_d(dst5, dst4);
2090 tmp3 = __lasx_xvilvl_d(dst7, dst6);
2091 dst0 = __lasx_vext2xv_hu_bu(tmp0);
2092 dst1 = __lasx_vext2xv_hu_bu(tmp1);
2094 dst2 = __lasx_vext2xv_hu_bu(tmp2);
2095 dst3 = __lasx_vext2xv_hu_bu(tmp3);
2096 tmp0 = __lasx_xvadd_h(src0, dst0);
2097 tmp1 = __lasx_xvadd_h(src1, dst1);
2098 tmp2 = __lasx_xvadd_h(src2, dst2);
2099 tmp3 = __lasx_xvadd_h(src3, dst3);
2100 dst1 = __lasx_xvpickev_b(tmp1, tmp0);
2101 dst2 = __lasx_xvpickev_b(tmp3, tmp2);
2102 __lasx_xvst(zero, _src, 0);
2103 __lasx_xvst(zero, _src, 32);
2104 __lasx_xvst(zero, _src, 64);
2105 __lasx_xvst(zero, _src, 96);
2106 __lasx_xvstelm_d(dst1, _dst, 0, 0);
2107 __lasx_xvstelm_d(dst1, _dst1, 0, 2);
2108 __lasx_xvstelm_d(dst1, _dst2, 0, 1);
2109 __lasx_xvstelm_d(dst1, _dst3, 0, 3);
2110 __lasx_xvstelm_d(dst2, _dst4, 0, 0);
2111 __lasx_xvstelm_d(dst2, _dst5, 0, 2);
2112 __lasx_xvstelm_d(dst2, _dst6, 0, 1);
2113 __lasx_xvstelm_d(dst2, _dst7, 0, 3);
2114 }
2115