/*
 * Loongson LASX optimized h264dsp
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264dsp_lasx.h"

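/* p1/q1 update of the normal (bS < 4) luma filter, per pixel:
 *   p1' = p1 + clip3((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc, tc)
 * (the q side reuses the macro with the p/q inputs swapped). */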
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
                         p1_or_q1_org_in, p2_or_q2_org_in,   \
                         neg_tc_in, tc_in, p1_or_q1_out)     \
{                                                            \
    __m256i clip3, temp;                                     \
                                                             \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                \
                             q0_or_p0_org_in);               \
    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);              \
    clip3 = __lasx_xvsub_h(clip3, temp);                     \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);          \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);        \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);   \
}

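/* p0/q0 update shared by the luma and chroma filters, per pixel:
 *   delta = clip3((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
 *   p0'   = clip_uint8(p0 + delta),  q0' = clip_uint8(q0 - delta) */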
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,       \
                     p1_or_q1_org_in, q1_or_p1_org_in,       \
                     neg_threshold_in, threshold_in,         \
                     p0_or_q0_out, q0_or_p0_out)             \
{                                                            \
    __m256i q0_sub_p0, p1_sub_q1, delta;                     \
                                                             \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,              \
                               p0_or_q0_org_in);             \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,              \
                               q1_or_p1_org_in);             \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);               \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);              \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);            \
    delta = __lasx_xvsrai_h(delta, 3);                       \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,         \
                            threshold_in);                   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);   \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);   \
                                                             \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);         \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);         \
}

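/* Loop filter across a vertical luma edge (the boundary between data[-1]
 * and data[0] in each row): 16 rows of 8 bytes around the edge are loaded
 * and transposed into p3..q3 vectors, filtered where the alpha/beta/tc
 * conditions hold (a negative tc entry disables its group of four rows),
 * then transposed back and stored. */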
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_8x = img_width << 3;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4;
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;

            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}

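/* Loop filter across a horizontal luma edge (the boundary between the rows
 * at data - img_width and data): p2..q2 are whole rows, so the filter runs
 * directly on row vectors and the results are merged back with
 * __lasx_xvbitsel_v before full-row stores. */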
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
                  p2_org, p1_org);
        p0_org = __lasx_xvldx(data, -img_width);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            q2_org = __lasx_xvldx(data, img_width_2x);

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0        = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h   = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}

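/* Chroma version of the vertical-edge filter: only p1/p0/q0/q1 take part,
 * 8 rows of 4 bytes around the edge are loaded and transposed, and only
 * p0/q0 are updated and stored back, two bytes per row. */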
void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 2;
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            /* LASX_TRANSPOSE8x4_B */
            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                      row7, row5, p1_org, p0_org, q0_org, q1_org);
            row0 = __lasx_xvilvl_b(p0_org, p1_org);
            row1 = __lasx_xvilvl_b(q1_org, q0_org);
            row3 = __lasx_xvilvh_w(row1, row0);
            row2 = __lasx_xvilvl_w(row1, row0);
            p1_org = __lasx_xvpermi_d(row2, 0x00);
            p0_org = __lasx_xvpermi_d(row2, 0x55);
            q0_org = __lasx_xvpermi_d(row3, 0x00);
            q1_org = __lasx_xvpermi_d(row3, 0x55);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i tc_h, neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
            src = data - 1;
            __lasx_xvstelm_h(p0_org, src, 0, 0);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 1);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 2);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 3);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 4);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 5);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 6);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 7);
        }
    }
}

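/* Chroma version of the horizontal-edge filter: p1/p0/q0/q1 rows are loaded
 * directly and only the low 8 bytes of the filtered p0/q0 are stored. */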
void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
                  p1_org, p0_org);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i neg_thresh_h, tc_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
                __lasx_xvstelm_d(q0_h, data, 0, 0);
            }
        }
    }
}

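/* Strong (intra, bS == 4) luma filter, per pixel; the q side uses the same
 * macro with the p/q inputs mirrored:
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 */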
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    __m256i threshold;                                                      \
    __m256i const2, const3 = __lasx_xvldi(0);                               \
                                                                            \
    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
                                                                            \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
                                                                            \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
                                                                            \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
}

/* data[-img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,             \
                         p1_or_q1_org_in, p0_or_q0_out)                \
{                                                                      \
    __m256i const2 = __lasx_xvldi(0);                                  \
    const2 = __lasx_xvaddi_hu(const2, 2);                              \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);              \
}

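/* Intra (bS == 4) filtering of a vertical luma edge: same 16x8 load and
 * transpose as the non-intra version, but where |p0 - q0| < (alpha >> 2) + 2
 * the strong three-pixel smoothing above is used instead of the tc-clipped
 * delta. */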
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);
        src += img_width_4x;

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta        = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        /* transpose and store */
        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i control = {0x0000000400000000, 0x0000000500000001,
                               0x0000000600000002, 0x0000000700000003};

            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                      p0_org, p1_org, p2_org, p3_org);
            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                      row0, row2);
            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                      row1, row3);
            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                      control, row7, control, row4, row5, row6, row7);
            src = data - 4;
            __lasx_xvstelm_d(row4, src, 0, 0);
            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row4, src, 0, 2);
            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 0);
            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 2);
            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 0);
            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 2);
            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 0);
            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 2);
            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
        }
    }
}

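/* Intra (bS == 4) filtering of a horizontal luma edge: rows are filtered in
 * place; p1/p2 (and q1/q2) are rewritten only inside the strong-filter
 * branch, while p0/q0 are always stored with the selected result. */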
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta        = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        __lasx_xvst(q0_org, data, 0);
    }
}

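/* Intra chroma filter across a vertical edge: p0 and q0 are replaced by
 *   p0' = (2 * p1 + p0 + q1 + 2) >> 2,  q0' = (2 * q1 + q0 + p1 + 2) >> 2
 * wherever the alpha/beta tests pass, then stored two bytes per row. */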
void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    uint8_t *src = data - 2;
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row4, row5, row6, row7);

        /* LASX_TRANSPOSE8x4_B */
        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
                  p1_org, p0_org, q0_org, q1_org);
        row0 = __lasx_xvilvl_b(p0_org, p1_org);
        row1 = __lasx_xvilvl_b(q1_org, q0_org);
        row3 = __lasx_xvilvh_w(row1, row0);
        row2 = __lasx_xvilvl_w(row1, row0);
        p1_org = __lasx_xvpermi_d(row2, 0x00);
        p0_org = __lasx_xvpermi_d(row2, 0x55);
        q0_org = __lasx_xvpermi_d(row3, 0x00);
        q1_org = __lasx_xvpermi_d(row3, 0x55);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
    }
    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
    src = data - 1;
    __lasx_xvstelm_h(p0_org, src, 0, 0);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 1);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 2);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 3);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 4);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 5);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 6);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 7);
}

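/* Intra chroma filter across a horizontal edge: same p0/q0 smoothing as
 * above, applied to whole rows without a transpose. */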
void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);

    p1_org = __lasx_xvldx(data, -img_width_2x);
    p0_org = __lasx_xvldx(data, -img_width);
    DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
        __lasx_xvstelm_d(q0_h, data, 0, 0);
    }
}

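/* Bi-directional weighted prediction on a 16-pixel-wide block:
 *   dst = clip_uint8((src * weight_src + dst * weight_dst +
 *                     (((offset_in + 1) | 1) << log2_denom)) >> (log2_denom + 1))
 * Pixels are XORed with 128 so the signed dot product __lasx_xvdp2add_h_b
 * can be used; the (weight_src + weight_dst) << 7 term folded into the
 * offset compensates for that bias. */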
ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset_in)1004 void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
1005                                       ptrdiff_t stride, int height,
1006                                       int log2_denom, int weight_dst,
1007                                       int weight_src, int offset_in)
1008 {
1009     __m256i wgt;
1010     __m256i src0, src1, src2, src3;
1011     __m256i dst0, dst1, dst2, dst3;
1012     __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1013     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1014     __m256i denom, offset;
1015     int stride_2x = stride << 1;
1016     int stride_4x = stride << 2;
1017     int stride_3x = stride_2x + stride;
1018 
1019     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1020     offset_in  += ((weight_src + weight_dst) << 7);
1021     log2_denom += 1;
1022 
1023     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1024     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1025     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1026     offset = __lasx_xvreplgr2vr_h(offset_in);
1027     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1028 
1029     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1030               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1031     src += stride_4x;
1032     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1033               src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1034     src += stride_4x;
1035     DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1036               0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1037     DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1038               dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1039     dst += stride_4x;
1040     DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1041               dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1042     dst -= stride_4x;
1043     DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1044               0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1045 
1046     DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1047               src0, src1, src2, src3);
1048     DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1049               dst0, dst1, dst2, dst3);
1050     DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1051               dst3, src3, vec0, vec2, vec4, vec6);
1052     DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1053               dst3, src3, vec1, vec3, vec5, vec7);
1054 
1055     DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1056               offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1057     DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1058               offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1059 
1060     tmp0 = __lasx_xvsra_h(tmp0, denom);
1061     tmp1 = __lasx_xvsra_h(tmp1, denom);
1062     tmp2 = __lasx_xvsra_h(tmp2, denom);
1063     tmp3 = __lasx_xvsra_h(tmp3, denom);
1064     tmp4 = __lasx_xvsra_h(tmp4, denom);
1065     tmp5 = __lasx_xvsra_h(tmp5, denom);
1066     tmp6 = __lasx_xvsra_h(tmp6, denom);
1067     tmp7 = __lasx_xvsra_h(tmp7, denom);
1068 
1069     DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1070                                   tmp0, tmp1, tmp2, tmp3);
1071     DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1072                                   tmp4, tmp5, tmp6, tmp7);
1073     DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1074               dst0, dst1, dst2, dst3);
1075     __lasx_xvstelm_d(dst0, dst, 0, 0);
1076     __lasx_xvstelm_d(dst0, dst, 8, 1);
1077     dst += stride;
1078     __lasx_xvstelm_d(dst0, dst, 0, 2);
1079     __lasx_xvstelm_d(dst0, dst, 8, 3);
1080     dst += stride;
1081     __lasx_xvstelm_d(dst1, dst, 0, 0);
1082     __lasx_xvstelm_d(dst1, dst, 8, 1);
1083     dst += stride;
1084     __lasx_xvstelm_d(dst1, dst, 0, 2);
1085     __lasx_xvstelm_d(dst1, dst, 8, 3);
1086     dst += stride;
1087     __lasx_xvstelm_d(dst2, dst, 0, 0);
1088     __lasx_xvstelm_d(dst2, dst, 8, 1);
1089     dst += stride;
1090     __lasx_xvstelm_d(dst2, dst, 0, 2);
1091     __lasx_xvstelm_d(dst2, dst, 8, 3);
1092     dst += stride;
1093     __lasx_xvstelm_d(dst3, dst, 0, 0);
1094     __lasx_xvstelm_d(dst3, dst, 8, 1);
1095     dst += stride;
1096     __lasx_xvstelm_d(dst3, dst, 0, 2);
1097     __lasx_xvstelm_d(dst3, dst, 8, 3);
1098     dst += stride;
1099 
1100     if (16 == height) {
1101         DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1102                   src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1103         src += stride_4x;
1104         DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1105                   src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1106         src += stride_4x;
1107         DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1108                   tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1109         DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1110                   dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1111         dst += stride_4x;
1112         DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1113                   dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1114         dst -= stride_4x;
1115         DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1116                   tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1117 
1118         DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1119                   src0, src1, src2, src3);
1120         DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1121                   dst0, dst1, dst2, dst3);
1122         DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1123                   dst3, src3, vec0, vec2, vec4, vec6);
1124         DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1125                   dst3, src3, vec1, vec3, vec5, vec7);
1126 
1127         DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1128                   offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1129         DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1130                   offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1131 
1132         tmp0 = __lasx_xvsra_h(tmp0, denom);
1133         tmp1 = __lasx_xvsra_h(tmp1, denom);
1134         tmp2 = __lasx_xvsra_h(tmp2, denom);
1135         tmp3 = __lasx_xvsra_h(tmp3, denom);
1136         tmp4 = __lasx_xvsra_h(tmp4, denom);
1137         tmp5 = __lasx_xvsra_h(tmp5, denom);
1138         tmp6 = __lasx_xvsra_h(tmp6, denom);
1139         tmp7 = __lasx_xvsra_h(tmp7, denom);
1140 
1141         DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1142                                       tmp0, tmp1, tmp2, tmp3);
1143         DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1144                                       tmp4, tmp5, tmp6, tmp7);
1145         DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
1146                   tmp6, dst0, dst1, dst2, dst3);
1147         __lasx_xvstelm_d(dst0, dst, 0, 0);
1148         __lasx_xvstelm_d(dst0, dst, 8, 1);
1149         dst += stride;
1150         __lasx_xvstelm_d(dst0, dst, 0, 2);
1151         __lasx_xvstelm_d(dst0, dst, 8, 3);
1152         dst += stride;
1153         __lasx_xvstelm_d(dst1, dst, 0, 0);
1154         __lasx_xvstelm_d(dst1, dst, 8, 1);
1155         dst += stride;
1156         __lasx_xvstelm_d(dst1, dst, 0, 2);
1157         __lasx_xvstelm_d(dst1, dst, 8, 3);
1158         dst += stride;
1159         __lasx_xvstelm_d(dst2, dst, 0, 0);
1160         __lasx_xvstelm_d(dst2, dst, 8, 1);
1161         dst += stride;
1162         __lasx_xvstelm_d(dst2, dst, 0, 2);
1163         __lasx_xvstelm_d(dst2, dst, 8, 3);
1164         dst += stride;
1165         __lasx_xvstelm_d(dst3, dst, 0, 0);
1166         __lasx_xvstelm_d(dst3, dst, 8, 1);
1167         dst += stride;
1168         __lasx_xvstelm_d(dst3, dst, 0, 2);
1169         __lasx_xvstelm_d(dst3, dst, 8, 3);
1170     }
1171 }
1172 
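/* Bi-weighted prediction helpers.
 *
 * Per pixel, each avc_biwgt_* helper below effectively computes
 *   out = clip_uint8((src * weight_src + dst * weight_dst +
 *                     (((offset + 1) | 1) << log2_denom)) >> (log2_denom + 1))
 * Source and destination bytes are XORed with 128 so the signed
 * __lasx_xvdp2add_h_b dot product can be used; the (weight_src + weight_dst)
 * << 7 term folded into the offset cancels that bias again.
 */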
1173 static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1174                                int32_t log2_denom, int32_t weight_src,
1175                                int32_t weight_dst, int32_t offset_in)
1176 {
1177     __m256i wgt, vec0, vec1;
1178     __m256i src0, dst0;
1179     __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1180     ptrdiff_t stride_2x = stride << 1;
1181     ptrdiff_t stride_3x = stride_2x + stride;
1182 
1183     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1184     offset_in  += ((weight_src + weight_dst) << 7);
1185     log2_denom += 1;
1186 
1187     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1188     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1189     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1190     offset = __lasx_xvreplgr2vr_h(offset_in);
1191     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1192 
1193     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1194               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1195     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1196     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1197     DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1198               dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1199     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1200     dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1201     DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1202     vec0 = __lasx_xvilvl_b(dst0, src0);
1203     vec1 = __lasx_xvilvh_b(dst0, src0);
1204     DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1205               tmp0, tmp1);
1206     tmp0 = __lasx_xvsra_h(tmp0, denom);
1207     tmp1 = __lasx_xvsra_h(tmp1, denom);
1208     DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1209     dst0 = __lasx_xvpickev_b(tmp1, tmp0);
1210     __lasx_xvstelm_d(dst0, dst, 0, 0);
1211     __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1212     __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1213     __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1214 }
1215 
1216 static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1217                                int32_t log2_denom, int32_t weight_src,
1218                                int32_t weight_dst, int32_t offset_in)
1219 {
1220     __m256i wgt, vec0, vec1, vec2, vec3;
1221     __m256i src0, src1, dst0, dst1;
1222     __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1223     ptrdiff_t stride_2x = stride << 1;
1224     ptrdiff_t stride_4x = stride << 2;
1225     ptrdiff_t stride_3x = stride_2x + stride;
1226     uint8_t* dst_tmp = dst;
1227 
1228     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1229     offset_in  += ((weight_src + weight_dst) << 7);
1230     log2_denom += 1;
1231 
1232     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1233     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1234     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1235     offset = __lasx_xvreplgr2vr_h(offset_in);
1236     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1237 
1238     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1239               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1240     src += stride_4x;
1241     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1242     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1243     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1244               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1245     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1246     src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1247     tmp0 = __lasx_xvld(dst_tmp, 0);
1248     DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
1249     tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
1250     dst_tmp += stride_4x;
1251     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1252     dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1253     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1254               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1255     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1256     dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1257 
1258     DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
1259               src0, src1, dst0, dst1);
1260     DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
1261     DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
1262     DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1263               offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1264     tmp0 = __lasx_xvsra_h(tmp0, denom);
1265     tmp1 = __lasx_xvsra_h(tmp1, denom);
1266     tmp2 = __lasx_xvsra_h(tmp2, denom);
1267     tmp3 = __lasx_xvsra_h(tmp3, denom);
1268     DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1269                                   tmp0, tmp1, tmp2, tmp3);
1270     DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
1271     __lasx_xvstelm_d(dst0, dst, 0, 0);
1272     __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1273     __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1274     __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1275     dst += stride_4x;
1276     __lasx_xvstelm_d(dst1, dst, 0, 0);
1277     __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1278     __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1279     __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1280 }
1281 
1282 static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1283                                 int32_t log2_denom, int32_t weight_src,
1284                                 int32_t weight_dst, int32_t offset_in)
1285 {
1286     __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1287     __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1288     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1289     ptrdiff_t stride_2x = stride << 1;
1290     ptrdiff_t stride_4x = stride << 2;
1291     ptrdiff_t stride_3x = stride_2x + stride;
1292     uint8_t* dst_tmp = dst;
1293 
1294     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1295     offset_in  += ((weight_src + weight_dst) << 7);
1296     log2_denom += 1;
1297 
1298     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1299     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1300     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1301     offset = __lasx_xvreplgr2vr_h(offset_in);
1302     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1303 
1304     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1305               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1306     src += stride_4x;
1307     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1308     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1309     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1310               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1311     src += stride_4x;
1312     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1313     src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1314     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1315               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1316     src += stride_4x;
1317     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1318     src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1319     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1320               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1321     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1322     src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1323 
1324     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1325               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1326     dst_tmp += stride_4x;
1327     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1328     dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1329     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1330               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1331     dst_tmp += stride_4x;
1332     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1333     dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1334     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1335               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1336     dst_tmp += stride_4x;
1337     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1338     dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1339     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1340               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1341     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1342     dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1343 
1344     DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1345               src0, src1, src2, src3);
1346     DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1347               dst0, dst1, dst2, dst3);
1348     DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1349               dst3, src3, vec0, vec2, vec4, vec6);
1350     DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1351               dst3, src3, vec1, vec3, vec5, vec7);
1352     DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1353               offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1354     DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1355               offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1356     tmp0 = __lasx_xvsra_h(tmp0, denom);
1357     tmp1 = __lasx_xvsra_h(tmp1, denom);
1358     tmp2 = __lasx_xvsra_h(tmp2, denom);
1359     tmp3 = __lasx_xvsra_h(tmp3, denom);
1360     tmp4 = __lasx_xvsra_h(tmp4, denom);
1361     tmp5 = __lasx_xvsra_h(tmp5, denom);
1362     tmp6 = __lasx_xvsra_h(tmp6, denom);
1363     tmp7 = __lasx_xvsra_h(tmp7, denom);
1364     DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1365                                   tmp0, tmp1, tmp2, tmp3);
1366     DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1367                                   tmp4, tmp5, tmp6, tmp7);
1368     DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1369               dst0, dst1, dst2, dst3);
1370     __lasx_xvstelm_d(dst0, dst, 0, 0);
1371     __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1372     __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1373     __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1374     dst += stride_4x;
1375     __lasx_xvstelm_d(dst1, dst, 0, 0);
1376     __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1377     __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1378     __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1379     dst += stride_4x;
1380     __lasx_xvstelm_d(dst2, dst, 0, 0);
1381     __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
1382     __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
1383     __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
1384     dst += stride_4x;
1385     __lasx_xvstelm_d(dst3, dst, 0, 0);
1386     __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
1387     __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
1388     __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
1389 }
1390 
1391 void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
1392                                      ptrdiff_t stride, int height,
1393                                      int log2_denom, int weight_dst,
1394                                      int weight_src, int offset)
1395 {
1396     if (4 == height) {
1397         avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
1398                            offset);
1399     } else if (8 == height) {
1400         avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
1401                            offset);
1402     } else {
1403         avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
1404                             offset);
1405     }
1406 }
1407 
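/* Width-4 bi-weighted variants: same arithmetic as the 8-wide helpers above,
 * but rows are gathered with word interleaves so that two, four or eight
 * 4-pixel rows share a single 256-bit vector before the dot product. */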
1408 static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1409                                int32_t log2_denom, int32_t weight_src,
1410                                int32_t weight_dst, int32_t offset_in)
1411 {
1412     __m256i wgt, vec0;
1413     __m256i src0, dst0;
1414     __m256i tmp0, tmp1, denom, offset;
1415 
1416     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1417     offset_in  += ((weight_src + weight_dst) << 7);
1418     log2_denom += 1;
1419 
1420     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1421     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1422     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1423     offset = __lasx_xvreplgr2vr_h(offset_in);
1424     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1425 
1426     DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1427     src0 = __lasx_xvilvl_w(tmp1, tmp0);
1428     DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
1429     dst0 = __lasx_xvilvl_w(tmp1, tmp0);
1430     DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1431     vec0 = __lasx_xvilvl_b(dst0, src0);
1432     tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1433     tmp0 = __lasx_xvsra_h(tmp0, denom);
1434     tmp0 = __lasx_xvclip255_h(tmp0);
1435     tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
1436     __lasx_xvstelm_w(tmp0, dst, 0, 0);
1437     __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1438 }
1439 
1440 static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1441                                int32_t log2_denom, int32_t weight_src,
1442                                int32_t weight_dst, int32_t offset_in)
1443 {
1444     __m256i wgt, vec0;
1445     __m256i src0, dst0;
1446     __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1447     ptrdiff_t stride_2x = stride << 1;
1448     ptrdiff_t stride_3x = stride_2x + stride;
1449 
1450     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1451     offset_in  += ((weight_src + weight_dst) << 7);
1452     log2_denom += 1;
1453 
1454     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1455     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1456     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1457     offset = __lasx_xvreplgr2vr_h(offset_in);
1458     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1459 
1460     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1461               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1462     DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1463     src0 = __lasx_xvilvl_w(tmp1, tmp0);
1464     DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1465               dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1466     DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1467     dst0 = __lasx_xvilvl_w(tmp1, tmp0);
1468     DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1469     vec0 = __lasx_xvilvl_b(dst0, src0);
1470     dst0 = __lasx_xvilvh_b(dst0, src0);
1471     vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
1472     tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1473     tmp0 = __lasx_xvsra_h(tmp0, denom);
1474     tmp0 = __lasx_xvclip255_h(tmp0);
1475     tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
1476     __lasx_xvstelm_w(tmp0, dst, 0, 0);
1477     __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1478     __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
1479     __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
1480 }
1481 
1482 static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1483                                int32_t log2_denom, int32_t weight_src,
1484                                int32_t weight_dst, int32_t offset_in)
1485 {
1486     __m256i wgt, vec0, vec1;
1487     __m256i src0, dst0;
1488     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1489     ptrdiff_t stride_2x = stride << 1;
1490     ptrdiff_t stride_4x = stride << 2;
1491     ptrdiff_t stride_3x = stride_2x + stride;
1492 
1493     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1494     offset_in  += ((weight_src + weight_dst) << 7);
1495     log2_denom += 1;
1496 
1497     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1498     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1499     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1500     offset = __lasx_xvreplgr2vr_h(offset_in);
1501     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1502 
1503     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1504               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1505     src += stride_4x;
1506     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1507               src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1508     DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1509               tmp0, tmp1, tmp2, tmp3);
1510     DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1511     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1512     DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1513               dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1514     dst += stride_4x;
1515     DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1516               dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1517     dst -= stride_4x;
1518     DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1519               tmp0, tmp1, tmp2, tmp3);
1520     DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1521     dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1522     DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1523     vec0 = __lasx_xvilvl_b(dst0, src0);
1524     vec1 = __lasx_xvilvh_b(dst0, src0);
1525     DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1526               tmp0, tmp1);
1527     tmp0 = __lasx_xvsra_h(tmp0, denom);
1528     tmp1 = __lasx_xvsra_h(tmp1, denom);
1529     DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1530     tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
1531     __lasx_xvstelm_w(tmp0, dst, 0, 0);
1532     __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1533     __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
1534     __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
1535     dst += stride_4x;
1536     __lasx_xvstelm_w(tmp0, dst, 0, 4);
1537     __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
1538     __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
1539     __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
1540 }
1541 
1542 void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
1543                                      ptrdiff_t stride, int height,
1544                                      int log2_denom, int weight_dst,
1545                                      int weight_src, int offset)
1546 {
1547     if (2 == height) {
1548         avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
1549                            weight_dst, offset);
1550     } else if (4 == height) {
1551         avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
1552                            weight_dst, offset);
1553     } else {
1554         avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
1555                            weight_dst, offset);
1556     }
1557 }
1558 
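/* Single-reference weighted prediction.
 *
 * Per pixel, the ff_weight_* functions and the avc_wgt_* helpers below
 * effectively compute
 *   src = clip_uint8((src * weight_src + (offset << log2_denom) + round)
 *                    >> log2_denom)
 * implemented as widen -> multiply -> saturating add of the pre-shifted
 * offset -> clamp at zero -> __lasx_xvssrlrn_bu_h, which performs the
 * rounding right shift and narrows back to unsigned bytes in one step.
 */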
1559 void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
1560                                     int height, int log2_denom,
1561                                     int weight_src, int offset_in)
1562 {
1563     uint32_t offset_val;
1564     ptrdiff_t stride_2x = stride << 1;
1565     ptrdiff_t stride_4x = stride << 2;
1566     ptrdiff_t stride_3x = stride_2x + stride;
1567     __m256i zero = __lasx_xvldi(0);
1568     __m256i src0, src1, src2, src3;
1569     __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
1570     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1571     __m256i wgt, denom, offset;
1572 
1573     offset_val = (unsigned) offset_in << log2_denom;
1574 
1575     wgt    = __lasx_xvreplgr2vr_h(weight_src);
1576     offset = __lasx_xvreplgr2vr_h(offset_val);
1577     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1578 
1579     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1580               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1581     src += stride_4x;
1582     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1583               src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1584     src -= stride_4x;
1585     DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1586               0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1587     DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1588               zero, src3, src0_l, src1_l, src2_l, src3_l);
1589     DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1590               zero, src3, src0_h, src1_h, src2_h, src3_h);
1591     src0_l = __lasx_xvmul_h(wgt, src0_l);
1592     src0_h = __lasx_xvmul_h(wgt, src0_h);
1593     src1_l = __lasx_xvmul_h(wgt, src1_l);
1594     src1_h = __lasx_xvmul_h(wgt, src1_h);
1595     src2_l = __lasx_xvmul_h(wgt, src2_l);
1596     src2_h = __lasx_xvmul_h(wgt, src2_h);
1597     src3_l = __lasx_xvmul_h(wgt, src3_l);
1598     src3_h = __lasx_xvmul_h(wgt, src3_h);
1599     DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1600               src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1601     DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1602               src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1603     src0_l = __lasx_xvmaxi_h(src0_l, 0);
1604     src0_h = __lasx_xvmaxi_h(src0_h, 0);
1605     src1_l = __lasx_xvmaxi_h(src1_l, 0);
1606     src1_h = __lasx_xvmaxi_h(src1_h, 0);
1607     src2_l = __lasx_xvmaxi_h(src2_l, 0);
1608     src2_h = __lasx_xvmaxi_h(src2_h, 0);
1609     src3_l = __lasx_xvmaxi_h(src3_l, 0);
1610     src3_h = __lasx_xvmaxi_h(src3_h, 0);
1611     src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1612     src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1613     src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1614     src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1615     src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1616     src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1617     src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1618     src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1619     __lasx_xvstelm_d(src0_l, src, 0, 0);
1620     __lasx_xvstelm_d(src0_h, src, 8, 0);
1621     src += stride;
1622     __lasx_xvstelm_d(src0_l, src, 0, 2);
1623     __lasx_xvstelm_d(src0_h, src, 8, 2);
1624     src += stride;
1625     __lasx_xvstelm_d(src1_l, src, 0, 0);
1626     __lasx_xvstelm_d(src1_h, src, 8, 0);
1627     src += stride;
1628     __lasx_xvstelm_d(src1_l, src, 0, 2);
1629     __lasx_xvstelm_d(src1_h, src, 8, 2);
1630     src += stride;
1631     __lasx_xvstelm_d(src2_l, src, 0, 0);
1632     __lasx_xvstelm_d(src2_h, src, 8, 0);
1633     src += stride;
1634     __lasx_xvstelm_d(src2_l, src, 0, 2);
1635     __lasx_xvstelm_d(src2_h, src, 8, 2);
1636     src += stride;
1637     __lasx_xvstelm_d(src3_l, src, 0, 0);
1638     __lasx_xvstelm_d(src3_h, src, 8, 0);
1639     src += stride;
1640     __lasx_xvstelm_d(src3_l, src, 0, 2);
1641     __lasx_xvstelm_d(src3_h, src, 8, 2);
1642     src += stride;
1643 
1644     if (16 == height) {
1645         DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1646                   src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1647         src += stride_4x;
1648         DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1649                   src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1650         src -= stride_4x;
1651         DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1652                   tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1653         DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1654                   zero, src3, src0_l, src1_l, src2_l, src3_l);
1655         DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1656                   zero, src3, src0_h, src1_h, src2_h, src3_h);
1657         src0_l = __lasx_xvmul_h(wgt, src0_l);
1658         src0_h = __lasx_xvmul_h(wgt, src0_h);
1659         src1_l = __lasx_xvmul_h(wgt, src1_l);
1660         src1_h = __lasx_xvmul_h(wgt, src1_h);
1661         src2_l = __lasx_xvmul_h(wgt, src2_l);
1662         src2_h = __lasx_xvmul_h(wgt, src2_h);
1663         src3_l = __lasx_xvmul_h(wgt, src3_l);
1664         src3_h = __lasx_xvmul_h(wgt, src3_h);
1665         DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
1666                   offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1667         DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
1668                   offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1669         src0_l = __lasx_xvmaxi_h(src0_l, 0);
1670         src0_h = __lasx_xvmaxi_h(src0_h, 0);
1671         src1_l = __lasx_xvmaxi_h(src1_l, 0);
1672         src1_h = __lasx_xvmaxi_h(src1_h, 0);
1673         src2_l = __lasx_xvmaxi_h(src2_l, 0);
1674         src2_h = __lasx_xvmaxi_h(src2_h, 0);
1675         src3_l = __lasx_xvmaxi_h(src3_l, 0);
1676         src3_h = __lasx_xvmaxi_h(src3_h, 0);
1677         src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1678         src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1679         src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1680         src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1681         src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1682         src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1683         src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1684         src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1685         __lasx_xvstelm_d(src0_l, src, 0, 0);
1686         __lasx_xvstelm_d(src0_h, src, 8, 0);
1687         src += stride;
1688         __lasx_xvstelm_d(src0_l, src, 0, 2);
1689         __lasx_xvstelm_d(src0_h, src, 8, 2);
1690         src += stride;
1691         __lasx_xvstelm_d(src1_l, src, 0, 0);
1692         __lasx_xvstelm_d(src1_h, src, 8, 0);
1693         src += stride;
1694         __lasx_xvstelm_d(src1_l, src, 0, 2);
1695         __lasx_xvstelm_d(src1_h, src, 8, 2);
1696         src += stride;
1697         __lasx_xvstelm_d(src2_l, src, 0, 0);
1698         __lasx_xvstelm_d(src2_h, src, 8, 0);
1699         src += stride;
1700         __lasx_xvstelm_d(src2_l, src, 0, 2);
1701         __lasx_xvstelm_d(src2_h, src, 8, 2);
1702         src += stride;
1703         __lasx_xvstelm_d(src3_l, src, 0, 0);
1704         __lasx_xvstelm_d(src3_h, src, 8, 0);
1705         src += stride;
1706         __lasx_xvstelm_d(src3_l, src, 0, 2);
1707         __lasx_xvstelm_d(src3_h, src, 8, 2);
1708     }
1709 }
1710 
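/* Width-8 weighted helpers: rows are packed with xvilvl_d/xvpermi_q so that
 * four rows fill one 256-bit vector, then the same weight/offset/shift
 * sequence as the 16-wide path is applied. */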
1711 static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
1712                              int32_t log2_denom, int32_t weight_src,
1713                              int32_t offset_in)
1714 {
1715     uint32_t offset_val;
1716     ptrdiff_t stride_2x = stride << 1;
1717     ptrdiff_t stride_3x = stride_2x + stride;
1718     __m256i wgt, zero = __lasx_xvldi(0);
1719     __m256i src0, src0_h, src0_l;
1720     __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1721 
1722     offset_val = (unsigned) offset_in << log2_denom;
1723 
1724     wgt    = __lasx_xvreplgr2vr_h(weight_src);
1725     offset = __lasx_xvreplgr2vr_h(offset_val);
1726     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1727 
1728     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1729               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1730     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1731     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1732     src0_l = __lasx_xvilvl_b(zero, src0);
1733     src0_h = __lasx_xvilvh_b(zero, src0);
1734     src0_l = __lasx_xvmul_h(wgt, src0_l);
1735     src0_h = __lasx_xvmul_h(wgt, src0_h);
1736     src0_l = __lasx_xvsadd_h(src0_l, offset);
1737     src0_h = __lasx_xvsadd_h(src0_h, offset);
1738     src0_l = __lasx_xvmaxi_h(src0_l, 0);
1739     src0_h = __lasx_xvmaxi_h(src0_h, 0);
1740     src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1741     src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1742 
1743     src0 = __lasx_xvpickev_d(src0_h, src0_l);
1744     __lasx_xvstelm_d(src0, src, 0, 0);
1745     __lasx_xvstelm_d(src0, src + stride, 0, 1);
1746     __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1747     __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1748 }
1749 
1750 static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
1751                              int32_t src_weight, int32_t offset_in)
1752 {
1753     __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
1754     __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1755     uint32_t offset_val;
1756     uint8_t* src_tmp = src;
1757     ptrdiff_t stride_2x = stride << 1;
1758     ptrdiff_t stride_4x = stride << 2;
1759     ptrdiff_t stride_3x = stride_2x + stride;
1760 
1761     offset_val = (unsigned) offset_in << log2_denom;
1762 
1763     wgt    = __lasx_xvreplgr2vr_h(src_weight);
1764     offset = __lasx_xvreplgr2vr_h(offset_val);
1765     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1766 
1767     DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1768               src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1769     src_tmp += stride_4x;
1770     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1771     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1772     DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1773               src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1774     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1775     src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1776     DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
1777     DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
1778     src0_l = __lasx_xvmul_h(wgt, src0_l);
1779     src0_h = __lasx_xvmul_h(wgt, src0_h);
1780     src1_l = __lasx_xvmul_h(wgt, src1_l);
1781     src1_h = __lasx_xvmul_h(wgt, src1_h);
1782     DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1783               src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1784     src0_l = __lasx_xvmaxi_h(src0_l, 0);
1785     src0_h = __lasx_xvmaxi_h(src0_h, 0);
1786     src1_l = __lasx_xvmaxi_h(src1_l, 0);
1787     src1_h = __lasx_xvmaxi_h(src1_h, 0);
1788     src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1789     src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1790     src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1791     src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1792 
1793     DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
1794     __lasx_xvstelm_d(src0, src, 0, 0);
1795     __lasx_xvstelm_d(src0, src + stride, 0, 1);
1796     __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1797     __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1798     src += stride_4x;
1799     __lasx_xvstelm_d(src1, src, 0, 0);
1800     __lasx_xvstelm_d(src1, src + stride, 0, 1);
1801     __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1802     __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1803 }
1804 
1805 static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
1806                               int32_t log2_denom, int32_t src_weight,
1807                               int32_t offset_in)
1808 {
1809     __m256i src0, src1, src2, src3;
1810     __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
1811     __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1812     __m256i zero = __lasx_xvldi(0);
1813     uint32_t offset_val;
1814     uint8_t* src_tmp = src;
1815     ptrdiff_t stride_2x = stride << 1;
1816     ptrdiff_t stride_4x = stride << 2;
1817     ptrdiff_t stride_3x = stride_2x + stride;
1818 
1819     offset_val = (unsigned) offset_in << log2_denom;
1820 
1821     wgt    = __lasx_xvreplgr2vr_h(src_weight);
1822     offset = __lasx_xvreplgr2vr_h(offset_val);
1823     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1824 
1825     DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1826               src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1827     src_tmp += stride_4x;
1828     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1829     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1830     DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1831               src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1832     src_tmp += stride_4x;
1833     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1834     src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1835     DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1836               src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1837     src_tmp += stride_4x;
1838     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1839     src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1840     DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1841               src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1842     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1843     src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1844 
1845     DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
1846               src0_l, src1_l, src2_l, src3_l);
1847     DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
1848               src0_h, src1_h, src2_h, src3_h);
1849     src0_l = __lasx_xvmul_h(wgt, src0_l);
1850     src0_h = __lasx_xvmul_h(wgt, src0_h);
1851     src1_l = __lasx_xvmul_h(wgt, src1_l);
1852     src1_h = __lasx_xvmul_h(wgt, src1_h);
1853     src2_l = __lasx_xvmul_h(wgt, src2_l);
1854     src2_h = __lasx_xvmul_h(wgt, src2_h);
1855     src3_l = __lasx_xvmul_h(wgt, src3_l);
1856     src3_h = __lasx_xvmul_h(wgt, src3_h);
1857 
1858     DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1859               src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1860     DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1861               src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1862 
1863     src0_l = __lasx_xvmaxi_h(src0_l, 0);
1864     src0_h = __lasx_xvmaxi_h(src0_h, 0);
1865     src1_l = __lasx_xvmaxi_h(src1_l, 0);
1866     src1_h = __lasx_xvmaxi_h(src1_h, 0);
1867     src2_l = __lasx_xvmaxi_h(src2_l, 0);
1868     src2_h = __lasx_xvmaxi_h(src2_h, 0);
1869     src3_l = __lasx_xvmaxi_h(src3_l, 0);
1870     src3_h = __lasx_xvmaxi_h(src3_h, 0);
1871     src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1872     src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1873     src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1874     src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1875     src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1876     src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1877     src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1878     src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1879     DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
1880               src3_h, src3_l, src0, src1, src2, src3);
1881 
1882     __lasx_xvstelm_d(src0, src, 0, 0);
1883     __lasx_xvstelm_d(src0, src + stride, 0, 1);
1884     __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1885     __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1886     src += stride_4x;
1887     __lasx_xvstelm_d(src1, src, 0, 0);
1888     __lasx_xvstelm_d(src1, src + stride, 0, 1);
1889     __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1890     __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1891     src += stride_4x;
1892     __lasx_xvstelm_d(src2, src, 0, 0);
1893     __lasx_xvstelm_d(src2, src + stride, 0, 1);
1894     __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
1895     __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
1896     src += stride_4x;
1897     __lasx_xvstelm_d(src3, src, 0, 0);
1898     __lasx_xvstelm_d(src3, src + stride, 0, 1);
1899     __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
1900     __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
1901 }
1902 
1903 void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
1904                                    int height, int log2_denom,
1905                                    int weight_src, int offset)
1906 {
1907     if (4 == height) {
1908         avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
1909     } else if (8 == height) {
1910         avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
1911     } else {
1912         avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
1913     }
1914 }
1915 
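/* Width-4 weighted helpers: rows are gathered with word interleaves; the
 * 4x2 and 4x4 cases fit in a single vector, while 4x8 splits into the
 * low/high byte-interleave halves. */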
1916 static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
1917                              int32_t log2_denom, int32_t weight_src,
1918                              int32_t offset_in)
1919 {
1920     uint32_t offset_val;
1921     __m256i wgt, zero = __lasx_xvldi(0);
1922     __m256i src0, tmp0, tmp1, denom, offset;
1923 
1924     offset_val = (unsigned) offset_in << log2_denom;
1925 
1926     wgt    = __lasx_xvreplgr2vr_h(weight_src);
1927     offset = __lasx_xvreplgr2vr_h(offset_val);
1928     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1929 
1930     DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1931     src0 = __lasx_xvilvl_w(tmp1, tmp0);
1932     src0 = __lasx_xvilvl_b(zero, src0);
1933     src0 = __lasx_xvmul_h(wgt, src0);
1934     src0 = __lasx_xvsadd_h(src0, offset);
1935     src0 = __lasx_xvmaxi_h(src0, 0);
1936     src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1937     __lasx_xvstelm_w(src0, src, 0, 0);
1938     __lasx_xvstelm_w(src0, src + stride, 0, 1);
1939 }
1940 
1941 static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
1942                              int32_t log2_denom, int32_t weight_src,
1943                              int32_t offset_in)
1944 {
1945     __m256i wgt;
1946     __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
1947     uint32_t offset_val;
1948     ptrdiff_t stride_2x = stride << 1;
1949     ptrdiff_t stride_3x = stride_2x + stride;
1950 
1951     offset_val = (unsigned) offset_in << log2_denom;
1952 
1953     wgt    = __lasx_xvreplgr2vr_h(weight_src);
1954     offset = __lasx_xvreplgr2vr_h(offset_val);
1955     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1956 
1957     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1958               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1959     DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1960     src0 = __lasx_xvilvl_w(tmp1, tmp0);
1961     src0 = __lasx_vext2xv_hu_bu(src0);
1962     src0 = __lasx_xvmul_h(wgt, src0);
1963     src0 = __lasx_xvsadd_h(src0, offset);
1964     src0 = __lasx_xvmaxi_h(src0, 0);
1965     src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1966     __lasx_xvstelm_w(src0, src, 0, 0);
1967     __lasx_xvstelm_w(src0, src + stride, 0, 1);
1968     __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
1969     __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
1970 }
1971 
1972 static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
1973                              int32_t log2_denom, int32_t weight_src,
1974                              int32_t offset_in)
1975 {
1976     __m256i src0, src0_h, src0_l;
1977     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1978     __m256i wgt, zero = __lasx_xvldi(0);
1979     uint32_t offset_val;
1980     ptrdiff_t stride_2x = stride << 1;
1981     ptrdiff_t stride_4x = stride << 2;
1982     ptrdiff_t stride_3x = stride_2x + stride;
1983 
1984     offset_val = (unsigned) offset_in << log2_denom;
1985 
1986     wgt    = __lasx_xvreplgr2vr_h(weight_src);
1987     offset = __lasx_xvreplgr2vr_h(offset_val);
1988     denom  = __lasx_xvreplgr2vr_h(log2_denom);
1989 
1990     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1991               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1992     src += stride_4x;
1993     DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1994               src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1995     src -= stride_4x;
1996     DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
1997               tmp5, tmp0, tmp1, tmp2, tmp3);
1998     DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1999     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
2000     src0_l = __lasx_xvilvl_b(zero, src0);
2001     src0_h = __lasx_xvilvh_b(zero, src0);
2002     src0_l = __lasx_xvmul_h(wgt, src0_l);
2003     src0_h = __lasx_xvmul_h(wgt, src0_h);
2004     src0_l = __lasx_xvsadd_h(src0_l, offset);
2005     src0_h = __lasx_xvsadd_h(src0_h, offset);
2006     src0_l = __lasx_xvmaxi_h(src0_l, 0);
2007     src0_h = __lasx_xvmaxi_h(src0_h, 0);
2008     src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
2009     src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
2010     __lasx_xvstelm_w(src0_l, src, 0, 0);
2011     __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
2012     __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
2013     __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
2014     src += stride_4x;
2015     __lasx_xvstelm_w(src0_l, src, 0, 4);
2016     __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
2017     __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
2018     __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
2019 }
2020 
2021 void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
2022                                    int height, int log2_denom,
2023                                    int weight_src, int offset)
2024 {
2025     if (2 == height) {
2026         avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
2027     } else if (4 == height) {
2028         avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
2029     } else {
2030         avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
2031     }
2032 }
2033 
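/* Add the 4x4 int16_t residual in _src to the destination pixels and zero
 * the residual buffer afterwards. */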
2034 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2035 {
2036     __m256i src0, dst0, dst1, dst2, dst3, zero;
2037     __m256i tmp0, tmp1;
2038     uint8_t* _dst1 = _dst + stride;
2039     uint8_t* _dst2 = _dst1 + stride;
2040     uint8_t* _dst3 = _dst2 + stride;
2041 
2042     src0 = __lasx_xvld(_src, 0);
2043     dst0 = __lasx_xvldrepl_w(_dst, 0);
2044     dst1 = __lasx_xvldrepl_w(_dst1, 0);
2045     dst2 = __lasx_xvldrepl_w(_dst2, 0);
2046     dst3 = __lasx_xvldrepl_w(_dst3, 0);
2047     tmp0 = __lasx_xvilvl_w(dst1, dst0);
2048     tmp1 = __lasx_xvilvl_w(dst3, dst2);
2049     dst0 = __lasx_xvilvl_d(tmp1, tmp0);
2050     tmp0 = __lasx_vext2xv_hu_bu(dst0);
2051     zero = __lasx_xvldi(0);
2052     tmp1 = __lasx_xvadd_h(src0, tmp0);
2053     dst0 = __lasx_xvpickev_b(tmp1, tmp1);
2054     __lasx_xvstelm_w(dst0, _dst, 0, 0);
2055     __lasx_xvstelm_w(dst0, _dst1, 0, 1);
2056     __lasx_xvstelm_w(dst0, _dst2, 0, 4);
2057     __lasx_xvstelm_w(dst0, _dst3, 0, 5);
2058     __lasx_xvst(zero, _src, 0);
2059 }
2060 
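/* 8x8 variant: four 32-byte loads cover the whole 8x8 int16_t residual;
 * destination rows are widened with vext2xv_hu_bu, added, and narrowed back
 * with xvpickev_b (hence the 0, 2, 1, 3 element order on the stores), then
 * the residual buffer is zeroed. */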
2061 void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2062 {
2063     __m256i src0, src1, src2, src3;
2064     __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2065     __m256i tmp0, tmp1, tmp2, tmp3;
2066     __m256i zero = __lasx_xvldi(0);
2067     uint8_t *_dst1 = _dst + stride;
2068     uint8_t *_dst2 = _dst1 + stride;
2069     uint8_t *_dst3 = _dst2 + stride;
2070     uint8_t *_dst4 = _dst3 + stride;
2071     uint8_t *_dst5 = _dst4 + stride;
2072     uint8_t *_dst6 = _dst5 + stride;
2073     uint8_t *_dst7 = _dst6 + stride;
2074 
2075     src0 = __lasx_xvld(_src, 0);
2076     src1 = __lasx_xvld(_src, 32);
2077     src2 = __lasx_xvld(_src, 64);
2078     src3 = __lasx_xvld(_src, 96);
2079     dst0 = __lasx_xvldrepl_d(_dst, 0);
2080     dst1 = __lasx_xvldrepl_d(_dst1, 0);
2081     dst2 = __lasx_xvldrepl_d(_dst2, 0);
2082     dst3 = __lasx_xvldrepl_d(_dst3, 0);
2083     dst4 = __lasx_xvldrepl_d(_dst4, 0);
2084     dst5 = __lasx_xvldrepl_d(_dst5, 0);
2085     dst6 = __lasx_xvldrepl_d(_dst6, 0);
2086     dst7 = __lasx_xvldrepl_d(_dst7, 0);
2087     tmp0 = __lasx_xvilvl_d(dst1, dst0);
2088     tmp1 = __lasx_xvilvl_d(dst3, dst2);
2089     tmp2 = __lasx_xvilvl_d(dst5, dst4);
2090     tmp3 = __lasx_xvilvl_d(dst7, dst6);
2091     dst0 = __lasx_vext2xv_hu_bu(tmp0);
2092     dst1 = __lasx_vext2xv_hu_bu(tmp1);
2094     dst2 = __lasx_vext2xv_hu_bu(tmp2);
2095     dst3 = __lasx_vext2xv_hu_bu(tmp3);
2096     tmp0 = __lasx_xvadd_h(src0, dst0);
2097     tmp1 = __lasx_xvadd_h(src1, dst1);
2098     tmp2 = __lasx_xvadd_h(src2, dst2);
2099     tmp3 = __lasx_xvadd_h(src3, dst3);
2100     dst1 = __lasx_xvpickev_b(tmp1, tmp0);
2101     dst2 = __lasx_xvpickev_b(tmp3, tmp2);
2102     __lasx_xvst(zero, _src, 0);
2103     __lasx_xvst(zero, _src, 32);
2104     __lasx_xvst(zero, _src, 64);
2105     __lasx_xvst(zero, _src, 96);
2106     __lasx_xvstelm_d(dst1, _dst, 0, 0);
2107     __lasx_xvstelm_d(dst1, _dst1, 0, 2);
2108     __lasx_xvstelm_d(dst1, _dst2, 0, 1);
2109     __lasx_xvstelm_d(dst1, _dst3, 0, 3);
2110     __lasx_xvstelm_d(dst2, _dst4, 0, 0);
2111     __lasx_xvstelm_d(dst2, _dst5, 0, 2);
2112     __lasx_xvstelm_d(dst2, _dst6, 0, 1);
2113     __lasx_xvstelm_d(dst2, _dst7, 0, 3);
2114 }
2115