/*
 * Loongson LASX optimized h264dsp
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264dsp_lasx.h"

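/* Weak-filter update of p1 (or q1, with p/q roles swapped):
 * p1' = p1 + av_clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc, tc)
 */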
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
                         p1_or_q1_org_in, p2_or_q2_org_in,   \
                         neg_tc_in, tc_in, p1_or_q1_out)     \
{                                                            \
    __m256i clip3, temp;                                     \
                                                             \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                \
                             q0_or_p0_org_in);               \
    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);              \
    clip3 = __lasx_xvsub_h(clip3, temp);                     \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);          \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);        \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);   \
}

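/* Weak-filter update of p0 and q0:
 * delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
 * p0'   = av_clip_uint8(p0 + delta)
 * q0'   = av_clip_uint8(q0 - delta)
 */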
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,       \
                     p1_or_q1_org_in, q1_or_p1_org_in,       \
                     neg_threshold_in, threshold_in,         \
                     p0_or_q0_out, q0_or_p0_out)             \
{                                                            \
    __m256i q0_sub_p0, p1_sub_q1, delta;                     \
                                                             \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,              \
                               p0_or_q0_org_in);             \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,              \
                               q1_or_p1_org_in);             \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);               \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);              \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);            \
    delta = __lasx_xvsrai_h(delta, 3);                       \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,         \
                            threshold_in);                   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);   \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);   \
                                                             \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);         \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);         \
}

void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_8x = img_width << 3;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

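    /* Broadcast each tc[i] to the four pixels of its 4-pixel edge segment;
     * bs_vec flags the segments with tc[i] >= 0, i.e. bS > 0. */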
    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4;
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;

            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}

void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
                  p2_org, p1_org);
        p0_org = __lasx_xvldx(data, -img_width);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            q2_org = __lasx_xvldx(data, img_width_2x);

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0        = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h   = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}

void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 2;
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            /* LASX_TRANSPOSE8x4_B */
            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                      row7, row5, p1_org, p0_org, q0_org, q1_org);
            row0 = __lasx_xvilvl_b(p0_org, p1_org);
            row1 = __lasx_xvilvl_b(q1_org, q0_org);
            row3 = __lasx_xvilvh_w(row1, row0);
            row2 = __lasx_xvilvl_w(row1, row0);
            p1_org = __lasx_xvpermi_d(row2, 0x00);
            p0_org = __lasx_xvpermi_d(row2, 0x55);
            q0_org = __lasx_xvpermi_d(row3, 0x00);
            q1_org = __lasx_xvpermi_d(row3, 0x55);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i tc_h, neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
            src = data - 1;
            __lasx_xvstelm_h(p0_org, src, 0, 0);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 1);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 2);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 3);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 4);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 5);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 6);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 7);
        }
    }
}

void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
                  p1_org, p0_org);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i neg_thresh_h, tc_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
                __lasx_xvstelm_d(q0_h, data, 0, 0);
            }
        }
    }
}

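/* Strong (bS = 4) filtering of one side of the edge:
 * p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
 * p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 * p2' = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
 * (and symmetrically for q0/q1/q2)
 */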
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    __m256i threshold;                                                      \
    __m256i const2, const3 = __lasx_xvldi(0);                               \
                                                                            \
    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
                                                                            \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
                                                                            \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
                                                                            \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
}

/* p0' (or q0') = (2 * p1 + p0 + q1 + 2) >> 2 */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,             \
                         p1_or_q1_org_in, p0_or_q0_out)                \
{                                                                      \
    __m256i const2 = __lasx_xvldi(0);                                  \
    const2 = __lasx_xvaddi_hu(const2, 2);                              \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);              \
}

void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);
        src += img_width_4x;

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta        = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        /* transpose and store */
        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i control = {0x0000000400000000, 0x0000000500000001,
                               0x0000000600000002, 0x0000000700000003};

            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                      p0_org, p1_org, p2_org, p3_org);
            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                      row0, row2);
            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                      row1, row3);
            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                      control, row7, control, row4, row5, row6, row7);
            src = data - 4;
            __lasx_xvstelm_d(row4, src, 0, 0);
            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row4, src, 0, 2);
            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 0);
            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 2);
            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 0);
            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 2);
            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 0);
            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 2);
            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
        }
    }
}

void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta        = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        __lasx_xvst(q0_org, data, 0);
    }
}

void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    uint8_t *src = data - 2;
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row4, row5, row6, row7);

        /* LASX_TRANSPOSE8x4_B */
        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
                  p1_org, p0_org, q0_org, q1_org);
        row0 = __lasx_xvilvl_b(p0_org, p1_org);
        row1 = __lasx_xvilvl_b(q1_org, q0_org);
        row3 = __lasx_xvilvh_w(row1, row0);
        row2 = __lasx_xvilvl_w(row1, row0);
        p1_org = __lasx_xvpermi_d(row2, 0x00);
        p0_org = __lasx_xvpermi_d(row2, 0x55);
        q0_org = __lasx_xvpermi_d(row3, 0x00);
        q1_org = __lasx_xvpermi_d(row3, 0x55);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
    }
    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
    src = data - 1;
    __lasx_xvstelm_h(p0_org, src, 0, 0);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 1);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 2);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 3);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 4);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 5);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 6);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 7);
}

void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);

    p1_org = __lasx_xvldx(data, -img_width_2x);
    p0_org = __lasx_xvldx(data, -img_width);
    DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
        __lasx_xvstelm_d(q0_h, data, 0, 0);
    }
}

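/* Biweighted prediction (per pixel, as in the C reference):
 *   dst = av_clip_uint8((src * weight_src + dst * weight_dst +
 *                        (((offset_in + 1) | 1) << log2_denom)) >> (log2_denom + 1))
 * Bytes are biased by -128 (xvxori_b with 128) so that the signed 8-bit dot
 * product xvdp2add_h_b can be used; 128 * (weight_src + weight_dst) is folded
 * back into the offset to compensate.
 */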
1004void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
1005                                      ptrdiff_t stride, int height,
1006                                      int log2_denom, int weight_dst,
1007                                      int weight_src, int offset_in)
1008{
1009    __m256i wgt;
1010    __m256i src0, src1, src2, src3;
1011    __m256i dst0, dst1, dst2, dst3;
1012    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1013    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1014    __m256i denom, offset;
1015    int stride_2x = stride << 1;
1016    int stride_4x = stride << 2;
1017    int stride_3x = stride_2x + stride;
1018
1019    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1020    offset_in  += ((weight_src + weight_dst) << 7);
1021    log2_denom += 1;
1022
1023    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1024    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1025    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1026    offset = __lasx_xvreplgr2vr_h(offset_in);
1027    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1028
1029    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1030              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1031    src += stride_4x;
1032    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1033              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1034    src += stride_4x;
1035    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1036              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1037    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1038              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1039    dst += stride_4x;
1040    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1041              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1042    dst -= stride_4x;
1043    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1044              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1045
1046    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1047              src0, src1, src2, src3);
1048    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1049              dst0, dst1, dst2, dst3);
1050    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1051              dst3, src3, vec0, vec2, vec4, vec6);
1052    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1053              dst3, src3, vec1, vec3, vec5, vec7);
1054
1055    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1056              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1057    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1058              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1059
1060    tmp0 = __lasx_xvsra_h(tmp0, denom);
1061    tmp1 = __lasx_xvsra_h(tmp1, denom);
1062    tmp2 = __lasx_xvsra_h(tmp2, denom);
1063    tmp3 = __lasx_xvsra_h(tmp3, denom);
1064    tmp4 = __lasx_xvsra_h(tmp4, denom);
1065    tmp5 = __lasx_xvsra_h(tmp5, denom);
1066    tmp6 = __lasx_xvsra_h(tmp6, denom);
1067    tmp7 = __lasx_xvsra_h(tmp7, denom);
1068
1069    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1070                                  tmp0, tmp1, tmp2, tmp3);
1071    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1072                                  tmp4, tmp5, tmp6, tmp7);
1073    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1074              dst0, dst1, dst2, dst3);
1075    __lasx_xvstelm_d(dst0, dst, 0, 0);
1076    __lasx_xvstelm_d(dst0, dst, 8, 1);
1077    dst += stride;
1078    __lasx_xvstelm_d(dst0, dst, 0, 2);
1079    __lasx_xvstelm_d(dst0, dst, 8, 3);
1080    dst += stride;
1081    __lasx_xvstelm_d(dst1, dst, 0, 0);
1082    __lasx_xvstelm_d(dst1, dst, 8, 1);
1083    dst += stride;
1084    __lasx_xvstelm_d(dst1, dst, 0, 2);
1085    __lasx_xvstelm_d(dst1, dst, 8, 3);
1086    dst += stride;
1087    __lasx_xvstelm_d(dst2, dst, 0, 0);
1088    __lasx_xvstelm_d(dst2, dst, 8, 1);
1089    dst += stride;
1090    __lasx_xvstelm_d(dst2, dst, 0, 2);
1091    __lasx_xvstelm_d(dst2, dst, 8, 3);
1092    dst += stride;
1093    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 2);
    __lasx_xvstelm_d(dst3, dst, 8, 3);
    dst += stride;

    if (16 == height) {
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
        src += stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
        dst += stride_4x;
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
        dst -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

        DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
                  dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec0, vec2, vec4, vec6);
        DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec1, vec3, vec5, vec7);

        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
                  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
                  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

        tmp0 = __lasx_xvsra_h(tmp0, denom);
        tmp1 = __lasx_xvsra_h(tmp1, denom);
        tmp2 = __lasx_xvsra_h(tmp2, denom);
        tmp3 = __lasx_xvsra_h(tmp3, denom);
        tmp4 = __lasx_xvsra_h(tmp4, denom);
        tmp5 = __lasx_xvsra_h(tmp5, denom);
        tmp6 = __lasx_xvsra_h(tmp6, denom);
        tmp7 = __lasx_xvsra_h(tmp7, denom);

        DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                                      tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
                                      tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
                  tmp6, dst0, dst1, dst2, dst3);
        __lasx_xvstelm_d(dst0, dst, 0, 0);
        __lasx_xvstelm_d(dst0, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst0, dst, 0, 2);
        __lasx_xvstelm_d(dst0, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 0);
        __lasx_xvstelm_d(dst1, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 2);
        __lasx_xvstelm_d(dst1, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 0);
        __lasx_xvstelm_d(dst2, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 2);
        __lasx_xvstelm_d(dst2, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 0);
        __lasx_xvstelm_d(dst3, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 2);
        __lasx_xvstelm_d(dst3, dst, 8, 3);
    }
}

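/* Bi-weighted prediction of an 8x4 block.  The four src and four dst rows
 * are packed into one 256-bit register each, every src/dst byte pair is
 * multiplied by the packed (weight_src, weight_dst) weights and accumulated
 * onto the offset with __lasx_xvdp2add_h_b, then shifted right by
 * log2_denom + 1 and clipped to [0, 255].  The pixels are XORed with 128 so
 * the signed byte multiplier can be used; the (weight_src + weight_dst) << 7
 * term added to offset_in compensates for that bias. */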
static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
}

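/* Bi-weighted prediction of an 8x8 block; same scheme as avc_biwgt_8x4_lasx
 * with two 256-bit register pairs. */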
static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3;
    __m256i src0, src1, dst0, dst1;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    tmp0 = __lasx_xvld(dst_tmp, 0);
    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
              src0, src1, dst0, dst1);
    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                                  tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
}

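/* Bi-weighted prediction of an 8x16 block; same scheme as avc_biwgt_8x4_lasx
 * with four 256-bit register pairs. */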
static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                int32_t log2_denom, int32_t weight_src,
                                int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                                  tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
                                  tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
}

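/* Width-8 bi-weighted prediction entry point: dispatches on height (4, 8 or 16). */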
void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    if (4 == height) {
        avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
                           offset);
    } else if (8 == height) {
        avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
                           offset);
    } else {
        avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
                            offset);
    }
}

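/* Bi-weighted prediction of a 4x2 block. */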
static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, denom, offset;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
}

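/* Bi-weighted prediction of a 4x4 block. */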
static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    dst0 = __lasx_xvilvh_b(dst0, src0);
    vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
}

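/* Bi-weighted prediction of a 4x8 block. */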
static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_w(tmp0, dst, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
}

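/* Width-4 bi-weighted prediction entry point: dispatches on height (2, 4 or 8). */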
void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    if (2 == height) {
        avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else if (4 == height) {
        avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else {
        avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    }
}

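/* Uni-directional weighted prediction of a 16x8 or 16x16 block, in place:
 * pixels are widened to 16 bits, multiplied by weight_src, offset by
 * offset_in << log2_denom with saturation, clamped to be non-negative and
 * narrowed back with a rounding logical right shift by log2_denom. */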
void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_denom,
                                    int weight_src, int offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i zero = __lasx_xvldi(0);
    __m256i src0, src1, src2, src3;
    __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    __lasx_xvstelm_d(src0_l, src, 0, 0);
    __lasx_xvstelm_d(src0_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src0_l, src, 0, 2);
    __lasx_xvstelm_d(src0_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 0);
    __lasx_xvstelm_d(src1_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 2);
    __lasx_xvstelm_d(src1_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 0);
    __lasx_xvstelm_d(src2_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 2);
    __lasx_xvstelm_d(src2_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 0);
    __lasx_xvstelm_d(src3_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 2);
    __lasx_xvstelm_d(src3_h, src, 8, 2);
    src += stride;

    if (16 == height) {
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
        src -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
                  zero, src3, src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
                  zero, src3, src0_h, src1_h, src2_h, src3_h);
        src0_l = __lasx_xvmul_h(wgt, src0_l);
        src0_h = __lasx_xvmul_h(wgt, src0_h);
        src1_l = __lasx_xvmul_h(wgt, src1_l);
        src1_h = __lasx_xvmul_h(wgt, src1_h);
        src2_l = __lasx_xvmul_h(wgt, src2_l);
        src2_h = __lasx_xvmul_h(wgt, src2_h);
        src3_l = __lasx_xvmul_h(wgt, src3_l);
        src3_h = __lasx_xvmul_h(wgt, src3_h);
        DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
                  offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
        DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
                  offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
        src0_l = __lasx_xvmaxi_h(src0_l, 0);
        src0_h = __lasx_xvmaxi_h(src0_h, 0);
        src1_l = __lasx_xvmaxi_h(src1_l, 0);
        src1_h = __lasx_xvmaxi_h(src1_h, 0);
        src2_l = __lasx_xvmaxi_h(src2_l, 0);
        src2_h = __lasx_xvmaxi_h(src2_h, 0);
        src3_l = __lasx_xvmaxi_h(src3_l, 0);
        src3_h = __lasx_xvmaxi_h(src3_h, 0);
        src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
        src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
        src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
        src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
        src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
        src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
        src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
        src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
        __lasx_xvstelm_d(src0_l, src, 0, 0);
        __lasx_xvstelm_d(src0_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src0_l, src, 0, 2);
        __lasx_xvstelm_d(src0_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src1_l, src, 0, 0);
        __lasx_xvstelm_d(src1_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src1_l, src, 0, 2);
        __lasx_xvstelm_d(src1_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src2_l, src, 0, 0);
        __lasx_xvstelm_d(src2_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src2_l, src, 0, 2);
        __lasx_xvstelm_d(src2_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src3_l, src, 0, 0);
        __lasx_xvstelm_d(src3_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src3_l, src, 0, 2);
        __lasx_xvstelm_d(src3_h, src, 8, 2);
    }
}

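/* Uni-directional weighted prediction of an 8x4 block, in place; same
 * arithmetic as ff_weight_h264_pixels16_8_lasx. */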
static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);

    src0 = __lasx_xvpickev_d(src0_h, src0_l);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
}

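/* Uni-directional weighted prediction of an 8x8 block, in place. */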
static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
                             int32_t src_weight, int32_t offset_in)
{
    __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    uint32_t offset_val;
    uint8_t* src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
    DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);

    DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
}

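/* Uni-directional weighted prediction of an 8x16 block, in place. */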
static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t offset_in)
{
    __m256i src0, src1, src2, src3;
    __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    __m256i zero = __lasx_xvldi(0);
    uint32_t offset_val;
    uint8_t* src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);

    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);

    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
              src3_h, src3_l, src0, src1, src2, src3);

    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src2, src, 0, 0);
    __lasx_xvstelm_d(src2, src + stride, 0, 1);
    __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src3, src, 0, 0);
    __lasx_xvstelm_d(src3, src + stride, 0, 1);
    __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
}

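/* Width-8 weighted prediction entry point: dispatches on height (4, 8 or 16). */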
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
    }
}

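/* Uni-directional weighted prediction of a 4x2 block, in place. */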
static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    uint32_t offset_val;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, tmp0, tmp1, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_xvilvl_b(zero, src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
}

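/* Uni-directional weighted prediction of a 4x4 block, in place. */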
static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    __m256i wgt;
    __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_vext2xv_hu_bu(src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
    __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
}

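/* Uni-directional weighted prediction of a 4x8 block, in place. */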
static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    __m256i wgt, zero = __lasx_xvldi(0);
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
              tmp5, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    __lasx_xvstelm_w(src0_l, src, 0, 0);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
    src += stride_4x;
    __lasx_xvstelm_w(src0_l, src, 0, 4);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
}

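/* Width-4 weighted prediction entry point: dispatches on height (2, 4 or 8). */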
void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
    }
}

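/* Add the 4x4 IDCT residual in _src to the destination block and zero the
 * residual buffer afterwards. */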
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, dst0, dst1, dst2, dst3, zero;
    __m256i tmp0, tmp1;
    uint8_t* _dst1 = _dst + stride;
    uint8_t* _dst2 = _dst1 + stride;
    uint8_t* _dst3 = _dst2 + stride;

    src0 = __lasx_xvld(_src, 0);
    dst0 = __lasx_xvldrepl_w(_dst, 0);
    dst1 = __lasx_xvldrepl_w(_dst1, 0);
    dst2 = __lasx_xvldrepl_w(_dst2, 0);
    dst3 = __lasx_xvldrepl_w(_dst3, 0);
    tmp0 = __lasx_xvilvl_w(dst1, dst0);
    tmp1 = __lasx_xvilvl_w(dst3, dst2);
    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
    tmp0 = __lasx_vext2xv_hu_bu(dst0);
    zero = __lasx_xvldi(0);
    tmp1 = __lasx_xvadd_h(src0, tmp0);
    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
    __lasx_xvstelm_w(dst0, _dst, 0, 0);
    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
    __lasx_xvst(zero, _src, 0);
}

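/* Add the 8x8 IDCT residual in _src to the destination block and zero the
 * residual buffer afterwards. */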
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i tmp0, tmp1, tmp2, tmp3;
    __m256i zero = __lasx_xvldi(0);
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;
    uint8_t *_dst4 = _dst3 + stride;
    uint8_t *_dst5 = _dst4 + stride;
    uint8_t *_dst6 = _dst5 + stride;
    uint8_t *_dst7 = _dst6 + stride;

    src0 = __lasx_xvld(_src, 0);
    src1 = __lasx_xvld(_src, 32);
    src2 = __lasx_xvld(_src, 64);
    src3 = __lasx_xvld(_src, 96);
    dst0 = __lasx_xvldrepl_d(_dst, 0);
    dst1 = __lasx_xvldrepl_d(_dst1, 0);
    dst2 = __lasx_xvldrepl_d(_dst2, 0);
    dst3 = __lasx_xvldrepl_d(_dst3, 0);
    dst4 = __lasx_xvldrepl_d(_dst4, 0);
    dst5 = __lasx_xvldrepl_d(_dst5, 0);
    dst6 = __lasx_xvldrepl_d(_dst6, 0);
    dst7 = __lasx_xvldrepl_d(_dst7, 0);
    tmp0 = __lasx_xvilvl_d(dst1, dst0);
    tmp1 = __lasx_xvilvl_d(dst3, dst2);
    tmp2 = __lasx_xvilvl_d(dst5, dst4);
    tmp3 = __lasx_xvilvl_d(dst7, dst6);
    dst0 = __lasx_vext2xv_hu_bu(tmp0);
    dst1 = __lasx_vext2xv_hu_bu(tmp1);
    dst2 = __lasx_vext2xv_hu_bu(tmp2);
    dst3 = __lasx_vext2xv_hu_bu(tmp3);
    tmp0 = __lasx_xvadd_h(src0, dst0);
    tmp1 = __lasx_xvadd_h(src1, dst1);
    tmp2 = __lasx_xvadd_h(src2, dst2);
    tmp3 = __lasx_xvadd_h(src3, dst3);
    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
    __lasx_xvst(zero, _src, 0);
    __lasx_xvst(zero, _src, 32);
    __lasx_xvst(zero, _src, 64);
    __lasx_xvst(zero, _src, 96);
    __lasx_xvstelm_d(dst1, _dst, 0, 0);
    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
}