1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Loongson LASX optimized h264dsp
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited
5cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
6cabdff1aSopenharmony_ci *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
7cabdff1aSopenharmony_ci *
8cabdff1aSopenharmony_ci * This file is part of FFmpeg.
9cabdff1aSopenharmony_ci *
10cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci *
15cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci * Lesser General Public License for more details.
19cabdff1aSopenharmony_ci *
20cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci */
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
26cabdff1aSopenharmony_ci#include "h264dsp_lasx.h"
27cabdff1aSopenharmony_ci
/*
 * Compute the filtered p1 (or, by symmetry, q1) sample on 16-bit lanes:
 *
 *   out = x1 + clip(neg_tc, tc, (x2 + ((p0 + q0 + 1) >> 1) - (x1 << 1)) >> 1)
 *
 * with x1 = p1_or_q1_org_in and x2 = p2_or_q2_org_in. All inputs are __m256i
 * values; p1_or_q1_out is an lvalue that receives the result.
 *
 * Each argument is evaluated exactly once. The internal temporaries use an
 * "lpf_*_m_" naming scheme so they are unlikely to capture a caller variable,
 * and the do { } while (0) wrapper makes the expansion a single statement
 * that is safe in unbraced if/else bodies.
 */
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,      \
                         p1_or_q1_org_in, p2_or_q2_org_in,      \
                         neg_tc_in, tc_in, p1_or_q1_out)        \
do {                                                            \
    const __m256i lpf_x1_m_ = (p1_or_q1_org_in);                \
    __m256i lpf_clip_m_, lpf_x1x2_m_;                           \
                                                                \
    /* (p0 + q0 + 1) >> 1 */                                    \
    lpf_clip_m_ = __lasx_xvavgr_hu((p0_or_q0_org_in),           \
                                   (q0_or_p0_org_in));          \
    lpf_x1x2_m_ = __lasx_xvslli_h(lpf_x1_m_, 1);                \
    lpf_clip_m_ = __lasx_xvsub_h(lpf_clip_m_, lpf_x1x2_m_);     \
    /* signed average with x2, i.e. (x2 + diff) >> 1 */         \
    lpf_clip_m_ = __lasx_xvavg_h((p2_or_q2_org_in),             \
                                 lpf_clip_m_);                  \
    lpf_clip_m_ = __lasx_xvclip_h(lpf_clip_m_, (neg_tc_in),     \
                                  (tc_in));                     \
    (p1_or_q1_out) = __lasx_xvadd_h(lpf_x1_m_, lpf_clip_m_);    \
} while (0)
42cabdff1aSopenharmony_ci
/*
 * Compute the filtered p0 and q0 samples on 16-bit lanes:
 *
 *   delta  = clip(neg_threshold, threshold,
 *                 (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
 *   p0_out = clip to [0, 255] of (p0 + delta)
 *   q0_out = clip to [0, 255] of (q0 - delta)
 *
 * Note the argument order: the first input is the sample that has delta
 * subtracted (q0), the second the one that has it added (p0). All inputs are
 * __m256i values; the two outputs are lvalues.
 *
 * Each argument is evaluated exactly once and all inputs are read before any
 * output is written. The internal temporaries use an "lpf_*_m_" naming scheme
 * so they are unlikely to capture a caller variable, and the
 * do { } while (0) wrapper makes the expansion a single statement that is
 * safe in unbraced if/else bodies.
 */
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,              \
                     p1_or_q1_org_in, q1_or_p1_org_in,              \
                     neg_threshold_in, threshold_in,                \
                     p0_or_q0_out, q0_or_p0_out)                    \
do {                                                                \
    const __m256i lpf_q0_m_ = (q0_or_p0_org_in);                    \
    const __m256i lpf_p0_m_ = (p0_or_q0_org_in);                    \
    __m256i lpf_diff0_m_, lpf_diff1_m_, lpf_delta_m_;               \
                                                                    \
    lpf_diff0_m_ = __lasx_xvsub_h(lpf_q0_m_, lpf_p0_m_);            \
    lpf_diff1_m_ = __lasx_xvsub_h((p1_or_q1_org_in),                \
                                  (q1_or_p1_org_in));               \
    lpf_diff0_m_ = __lasx_xvslli_h(lpf_diff0_m_, 2);                \
    lpf_diff1_m_ = __lasx_xvaddi_hu(lpf_diff1_m_, 4);               \
    lpf_delta_m_ = __lasx_xvadd_h(lpf_diff0_m_, lpf_diff1_m_);      \
    lpf_delta_m_ = __lasx_xvsrai_h(lpf_delta_m_, 3);                \
    lpf_delta_m_ = __lasx_xvclip_h(lpf_delta_m_,                    \
                                   (neg_threshold_in),              \
                                   (threshold_in));                 \
    lpf_diff0_m_ = __lasx_xvadd_h(lpf_p0_m_, lpf_delta_m_);         \
    lpf_diff1_m_ = __lasx_xvsub_h(lpf_q0_m_, lpf_delta_m_);         \
                                                                    \
    (p0_or_q0_out) = __lasx_xvclip255_h(lpf_diff0_m_);              \
    (q0_or_p0_out) = __lasx_xvclip255_h(lpf_diff1_m_);              \
} while (0)
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_civoid ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
68cabdff1aSopenharmony_ci                               int alpha_in, int beta_in, int8_t *tc)
69cabdff1aSopenharmony_ci{
70cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
71cabdff1aSopenharmony_ci    ptrdiff_t img_width_4x = img_width << 2;
72cabdff1aSopenharmony_ci    ptrdiff_t img_width_8x = img_width << 3;
73cabdff1aSopenharmony_ci    ptrdiff_t img_width_3x = img_width_2x + img_width;
74cabdff1aSopenharmony_ci    __m256i tmp_vec0, bs_vec;
75cabdff1aSopenharmony_ci    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
76cabdff1aSopenharmony_ci                      0x0101010100000000, 0x0303030302020202};
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
79cabdff1aSopenharmony_ci    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
80cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
81cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
82cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
83cabdff1aSopenharmony_ci
84cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(bs_vec)) {
85cabdff1aSopenharmony_ci        uint8_t *src = data - 4;
86cabdff1aSopenharmony_ci        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
87cabdff1aSopenharmony_ci        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
88cabdff1aSopenharmony_ci        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
89cabdff1aSopenharmony_ci        __m256i is_bs_greater_than0;
90cabdff1aSopenharmony_ci        __m256i zero = __lasx_xvldi(0);
91cabdff1aSopenharmony_ci
92cabdff1aSopenharmony_ci        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
93cabdff1aSopenharmony_ci
94cabdff1aSopenharmony_ci        {
95cabdff1aSopenharmony_ci            uint8_t *src_tmp = src + img_width_8x;
96cabdff1aSopenharmony_ci            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
97cabdff1aSopenharmony_ci            __m256i row8, row9, row10, row11, row12, row13, row14, row15;
98cabdff1aSopenharmony_ci
99cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
100cabdff1aSopenharmony_ci                      src, img_width_3x, row0, row1, row2, row3);
101cabdff1aSopenharmony_ci            src += img_width_4x;
102cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
103cabdff1aSopenharmony_ci                      src, img_width_3x, row4, row5, row6, row7);
104cabdff1aSopenharmony_ci            src -= img_width_4x;
105cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
106cabdff1aSopenharmony_ci                      img_width_2x, src_tmp, img_width_3x,
107cabdff1aSopenharmony_ci                      row8, row9, row10, row11);
108cabdff1aSopenharmony_ci            src_tmp += img_width_4x;
109cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
110cabdff1aSopenharmony_ci                      img_width_2x, src_tmp, img_width_3x,
111cabdff1aSopenharmony_ci                      row12, row13, row14, row15);
112cabdff1aSopenharmony_ci            src_tmp -= img_width_4x;
113cabdff1aSopenharmony_ci
114cabdff1aSopenharmony_ci            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
115cabdff1aSopenharmony_ci                                 row7, row8, row9, row10, row11,
116cabdff1aSopenharmony_ci                                 row12, row13, row14, row15,
117cabdff1aSopenharmony_ci                                 p3_org, p2_org, p1_org, p0_org,
118cabdff1aSopenharmony_ci                                 q0_org, q1_org, q2_org, q3_org);
119cabdff1aSopenharmony_ci        }
120cabdff1aSopenharmony_ci
121cabdff1aSopenharmony_ci        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
122cabdff1aSopenharmony_ci        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
123cabdff1aSopenharmony_ci        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
124cabdff1aSopenharmony_ci
125cabdff1aSopenharmony_ci        alpha = __lasx_xvreplgr2vr_b(alpha_in);
126cabdff1aSopenharmony_ci        beta  = __lasx_xvreplgr2vr_b(beta_in);
127cabdff1aSopenharmony_ci
128cabdff1aSopenharmony_ci        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
129cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
130cabdff1aSopenharmony_ci        is_less_than       = is_less_than_alpha & is_less_than_beta;
131cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
132cabdff1aSopenharmony_ci        is_less_than       = is_less_than_beta & is_less_than;
133cabdff1aSopenharmony_ci        is_less_than       = is_less_than & is_bs_greater_than0;
134cabdff1aSopenharmony_ci
135cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than)) {
136cabdff1aSopenharmony_ci            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
137cabdff1aSopenharmony_ci            __m256i p2_asub_p0, q2_asub_q0;
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci            neg_tc_h = __lasx_xvneg_b(tc_vec);
140cabdff1aSopenharmony_ci            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
141cabdff1aSopenharmony_ci            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
142cabdff1aSopenharmony_ci            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
143cabdff1aSopenharmony_ci            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
144cabdff1aSopenharmony_ci            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
145cabdff1aSopenharmony_ci
146cabdff1aSopenharmony_ci            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
147cabdff1aSopenharmony_ci            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
148cabdff1aSopenharmony_ci            is_less_than_beta = is_less_than_beta & is_less_than;
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci            if (__lasx_xbnz_v(is_less_than_beta)) {
151cabdff1aSopenharmony_ci                __m256i p2_org_h, p1_h;
152cabdff1aSopenharmony_ci
153cabdff1aSopenharmony_ci                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
154cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
155cabdff1aSopenharmony_ci                                 neg_tc_h, tc_h, p1_h);
156cabdff1aSopenharmony_ci                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
157cabdff1aSopenharmony_ci                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
158cabdff1aSopenharmony_ci                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
159cabdff1aSopenharmony_ci                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
160cabdff1aSopenharmony_ci                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
161cabdff1aSopenharmony_ci            }
162cabdff1aSopenharmony_ci
163cabdff1aSopenharmony_ci            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
164cabdff1aSopenharmony_ci            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
165cabdff1aSopenharmony_ci            is_less_than_beta = is_less_than_beta & is_less_than;
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
168cabdff1aSopenharmony_ci
169cabdff1aSopenharmony_ci            if (__lasx_xbnz_v(is_less_than_beta)) {
170cabdff1aSopenharmony_ci                __m256i q2_org_h, q1_h;
171cabdff1aSopenharmony_ci
172cabdff1aSopenharmony_ci                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
173cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
174cabdff1aSopenharmony_ci                                 neg_tc_h, tc_h, q1_h);
175cabdff1aSopenharmony_ci                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
176cabdff1aSopenharmony_ci                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
177cabdff1aSopenharmony_ci                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
180cabdff1aSopenharmony_ci                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
181cabdff1aSopenharmony_ci            }
182cabdff1aSopenharmony_ci
183cabdff1aSopenharmony_ci            {
184cabdff1aSopenharmony_ci                __m256i neg_thresh_h, p0_h, q0_h;
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_xvneg_b(tc_vec);
187cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
188cabdff1aSopenharmony_ci                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ci                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
191cabdff1aSopenharmony_ci                             neg_thresh_h, tc_h, p0_h, q0_h);
192cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
193cabdff1aSopenharmony_ci                          p0_h, q0_h);
194cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
195cabdff1aSopenharmony_ci                          p0_h, q0_h);
196cabdff1aSopenharmony_ci                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
197cabdff1aSopenharmony_ci                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
198cabdff1aSopenharmony_ci            }
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci            {
201cabdff1aSopenharmony_ci                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
202cabdff1aSopenharmony_ci                __m256i control = {0x0000000400000000, 0x0000000500000001,
203cabdff1aSopenharmony_ci                                   0x0000000600000002, 0x0000000700000003};
204cabdff1aSopenharmony_ci
205cabdff1aSopenharmony_ci                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
206cabdff1aSopenharmony_ci                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
207cabdff1aSopenharmony_ci                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
208cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
209cabdff1aSopenharmony_ci                          row0, row2);
210cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
211cabdff1aSopenharmony_ci                          row1, row3);
212cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
213cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
214cabdff1aSopenharmony_ci                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
215cabdff1aSopenharmony_ci                          control, row7, control, row4, row5, row6, row7);
216cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row4, src, 0, 0);
217cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
218cabdff1aSopenharmony_ci                src += img_width_2x;
219cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row4, src, 0, 2);
220cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
221cabdff1aSopenharmony_ci                src += img_width_2x;
222cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row5, src, 0, 0);
223cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
224cabdff1aSopenharmony_ci                src += img_width_2x;
225cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row5, src, 0, 2);
226cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
227cabdff1aSopenharmony_ci                src += img_width_2x;
228cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row6, src, 0, 0);
229cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
230cabdff1aSopenharmony_ci                src += img_width_2x;
231cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row6, src, 0, 2);
232cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
233cabdff1aSopenharmony_ci                src += img_width_2x;
234cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row7, src, 0, 0);
235cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
236cabdff1aSopenharmony_ci                src += img_width_2x;
237cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row7, src, 0, 2);
238cabdff1aSopenharmony_ci                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
239cabdff1aSopenharmony_ci            }
240cabdff1aSopenharmony_ci        }
241cabdff1aSopenharmony_ci    }
242cabdff1aSopenharmony_ci}
243cabdff1aSopenharmony_ci
244cabdff1aSopenharmony_civoid ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
245cabdff1aSopenharmony_ci                                   int alpha_in, int beta_in, int8_t *tc)
246cabdff1aSopenharmony_ci{
247cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
248cabdff1aSopenharmony_ci    ptrdiff_t img_width_3x = img_width + img_width_2x;
249cabdff1aSopenharmony_ci    __m256i tmp_vec0, bs_vec;
250cabdff1aSopenharmony_ci    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
251cabdff1aSopenharmony_ci                      0x0101010100000000, 0x0303030302020202};
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
254cabdff1aSopenharmony_ci    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
255cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
256cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
257cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
258cabdff1aSopenharmony_ci
259cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(bs_vec)) {
260cabdff1aSopenharmony_ci        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
261cabdff1aSopenharmony_ci        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
262cabdff1aSopenharmony_ci        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
263cabdff1aSopenharmony_ci        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
264cabdff1aSopenharmony_ci        __m256i is_bs_greater_than0;
265cabdff1aSopenharmony_ci        __m256i zero = __lasx_xvldi(0);
266cabdff1aSopenharmony_ci
267cabdff1aSopenharmony_ci        alpha = __lasx_xvreplgr2vr_b(alpha_in);
268cabdff1aSopenharmony_ci        beta  = __lasx_xvreplgr2vr_b(beta_in);
269cabdff1aSopenharmony_ci
270cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
271cabdff1aSopenharmony_ci                  p2_org, p1_org);
272cabdff1aSopenharmony_ci        p0_org = __lasx_xvldx(data, -img_width);
273cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
274cabdff1aSopenharmony_ci
275cabdff1aSopenharmony_ci        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
276cabdff1aSopenharmony_ci        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
277cabdff1aSopenharmony_ci        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
278cabdff1aSopenharmony_ci        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
281cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
282cabdff1aSopenharmony_ci        is_less_than       = is_less_than_alpha & is_less_than_beta;
283cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
284cabdff1aSopenharmony_ci        is_less_than       = is_less_than_beta & is_less_than;
285cabdff1aSopenharmony_ci        is_less_than       = is_less_than & is_bs_greater_than0;
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than)) {
288cabdff1aSopenharmony_ci            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;
289cabdff1aSopenharmony_ci
290cabdff1aSopenharmony_ci            q2_org = __lasx_xvldx(data, img_width_2x);
291cabdff1aSopenharmony_ci
292cabdff1aSopenharmony_ci            neg_tc_h = __lasx_xvneg_b(tc_vec);
293cabdff1aSopenharmony_ci            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
294cabdff1aSopenharmony_ci            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
295cabdff1aSopenharmony_ci            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
296cabdff1aSopenharmony_ci            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
297cabdff1aSopenharmony_ci            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ci            p2_asub_p0        = __lasx_xvabsd_bu(p2_org, p0_org);
300cabdff1aSopenharmony_ci            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
301cabdff1aSopenharmony_ci            is_less_than_beta = is_less_than_beta & is_less_than;
302cabdff1aSopenharmony_ci
303cabdff1aSopenharmony_ci            if (__lasx_xbnz_v(is_less_than_beta)) {
304cabdff1aSopenharmony_ci                __m256i p1_h, p2_org_h;
305cabdff1aSopenharmony_ci
306cabdff1aSopenharmony_ci                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
307cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
308cabdff1aSopenharmony_ci                                 neg_tc_h, tc_h, p1_h);
309cabdff1aSopenharmony_ci                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
310cabdff1aSopenharmony_ci                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
311cabdff1aSopenharmony_ci                p1_h   = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
312cabdff1aSopenharmony_ci                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
313cabdff1aSopenharmony_ci                __lasx_xvst(p1_org, data - img_width_2x, 0);
314cabdff1aSopenharmony_ci
315cabdff1aSopenharmony_ci                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
316cabdff1aSopenharmony_ci                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
317cabdff1aSopenharmony_ci            }
318cabdff1aSopenharmony_ci
319cabdff1aSopenharmony_ci            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
320cabdff1aSopenharmony_ci            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
321cabdff1aSopenharmony_ci            is_less_than_beta = is_less_than_beta & is_less_than;
322cabdff1aSopenharmony_ci
323cabdff1aSopenharmony_ci            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
324cabdff1aSopenharmony_ci
325cabdff1aSopenharmony_ci            if (__lasx_xbnz_v(is_less_than_beta)) {
326cabdff1aSopenharmony_ci                __m256i q1_h, q2_org_h;
327cabdff1aSopenharmony_ci
328cabdff1aSopenharmony_ci                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
329cabdff1aSopenharmony_ci                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
330cabdff1aSopenharmony_ci                                 neg_tc_h, tc_h, q1_h);
331cabdff1aSopenharmony_ci                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
332cabdff1aSopenharmony_ci                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
333cabdff1aSopenharmony_ci                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
334cabdff1aSopenharmony_ci                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
335cabdff1aSopenharmony_ci                __lasx_xvst(q1_org, data + img_width, 0);
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_ci                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
338cabdff1aSopenharmony_ci                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
339cabdff1aSopenharmony_ci
340cabdff1aSopenharmony_ci            }
341cabdff1aSopenharmony_ci
342cabdff1aSopenharmony_ci            {
343cabdff1aSopenharmony_ci                __m256i neg_thresh_h, p0_h, q0_h;
344cabdff1aSopenharmony_ci
345cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_xvneg_b(tc_vec);
346cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
347cabdff1aSopenharmony_ci                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
350cabdff1aSopenharmony_ci                             neg_thresh_h, tc_h, p0_h, q0_h);
351cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
352cabdff1aSopenharmony_ci                          p0_h, q0_h);
353cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0Xd8, q0_h, 0xd8,
354cabdff1aSopenharmony_ci                          p0_h, q0_h);
355cabdff1aSopenharmony_ci                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
356cabdff1aSopenharmony_ci                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
357cabdff1aSopenharmony_ci                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
358cabdff1aSopenharmony_ci                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
359cabdff1aSopenharmony_ci                __lasx_xvst(p0_org, data - img_width, 0);
360cabdff1aSopenharmony_ci                __lasx_xvst(q0_org, data, 0);
361cabdff1aSopenharmony_ci            }
362cabdff1aSopenharmony_ci        }
363cabdff1aSopenharmony_ci    }
364cabdff1aSopenharmony_ci}
365cabdff1aSopenharmony_ci
366cabdff1aSopenharmony_civoid ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
367cabdff1aSopenharmony_ci                                 int alpha_in, int beta_in, int8_t *tc)
368cabdff1aSopenharmony_ci{
369cabdff1aSopenharmony_ci    __m256i tmp_vec0, bs_vec;
370cabdff1aSopenharmony_ci    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
371cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
372cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
373cabdff1aSopenharmony_ci    ptrdiff_t img_width_4x = img_width << 2;
374cabdff1aSopenharmony_ci    ptrdiff_t img_width_3x = img_width_2x + img_width;
375cabdff1aSopenharmony_ci
376cabdff1aSopenharmony_ci    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
377cabdff1aSopenharmony_ci    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
378cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
379cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
380cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
381cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);
382cabdff1aSopenharmony_ci
383cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(bs_vec)) {
384cabdff1aSopenharmony_ci        uint8_t *src = data - 2;
385cabdff1aSopenharmony_ci        __m256i p1_org, p0_org, q0_org, q1_org;
386cabdff1aSopenharmony_ci        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
387cabdff1aSopenharmony_ci        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
388cabdff1aSopenharmony_ci        __m256i is_bs_greater_than0;
389cabdff1aSopenharmony_ci
390cabdff1aSopenharmony_ci        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
391cabdff1aSopenharmony_ci
392cabdff1aSopenharmony_ci        {
393cabdff1aSopenharmony_ci            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
394cabdff1aSopenharmony_ci
395cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
396cabdff1aSopenharmony_ci                      src, img_width_3x, row0, row1, row2, row3);
397cabdff1aSopenharmony_ci            src += img_width_4x;
398cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
399cabdff1aSopenharmony_ci                      src, img_width_3x, row4, row5, row6, row7);
400cabdff1aSopenharmony_ci            src -= img_width_4x;
401cabdff1aSopenharmony_ci            /* LASX_TRANSPOSE8x4_B */
402cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
403cabdff1aSopenharmony_ci                      row7, row5, p1_org, p0_org, q0_org, q1_org);
404cabdff1aSopenharmony_ci            row0 = __lasx_xvilvl_b(p0_org, p1_org);
405cabdff1aSopenharmony_ci            row1 = __lasx_xvilvl_b(q1_org, q0_org);
406cabdff1aSopenharmony_ci            row3 = __lasx_xvilvh_w(row1, row0);
407cabdff1aSopenharmony_ci            row2 = __lasx_xvilvl_w(row1, row0);
408cabdff1aSopenharmony_ci            p1_org = __lasx_xvpermi_d(row2, 0x00);
409cabdff1aSopenharmony_ci            p0_org = __lasx_xvpermi_d(row2, 0x55);
410cabdff1aSopenharmony_ci            q0_org = __lasx_xvpermi_d(row3, 0x00);
411cabdff1aSopenharmony_ci            q1_org = __lasx_xvpermi_d(row3, 0x55);
412cabdff1aSopenharmony_ci        }
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
415cabdff1aSopenharmony_ci        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
416cabdff1aSopenharmony_ci        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
417cabdff1aSopenharmony_ci
418cabdff1aSopenharmony_ci        alpha = __lasx_xvreplgr2vr_b(alpha_in);
419cabdff1aSopenharmony_ci        beta  = __lasx_xvreplgr2vr_b(beta_in);
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ci        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
422cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
423cabdff1aSopenharmony_ci        is_less_than       = is_less_than_alpha & is_less_than_beta;
424cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
425cabdff1aSopenharmony_ci        is_less_than       = is_less_than_beta & is_less_than;
426cabdff1aSopenharmony_ci        is_less_than       = is_less_than & is_bs_greater_than0;
427cabdff1aSopenharmony_ci
428cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than)) {
429cabdff1aSopenharmony_ci            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
430cabdff1aSopenharmony_ci
431cabdff1aSopenharmony_ci            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
432cabdff1aSopenharmony_ci            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
433cabdff1aSopenharmony_ci            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
434cabdff1aSopenharmony_ci            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
435cabdff1aSopenharmony_ci
436cabdff1aSopenharmony_ci            {
437cabdff1aSopenharmony_ci                __m256i tc_h, neg_thresh_h, p0_h, q0_h;
438cabdff1aSopenharmony_ci
439cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_xvneg_b(tc_vec);
440cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
441cabdff1aSopenharmony_ci                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
444cabdff1aSopenharmony_ci                             neg_thresh_h, tc_h, p0_h, q0_h);
445cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
446cabdff1aSopenharmony_ci                          p0_h, q0_h);
447cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
448cabdff1aSopenharmony_ci                          p0_h, q0_h);
449cabdff1aSopenharmony_ci                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
450cabdff1aSopenharmony_ci                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
451cabdff1aSopenharmony_ci            }
452cabdff1aSopenharmony_ci
453cabdff1aSopenharmony_ci            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
454cabdff1aSopenharmony_ci            src = data - 1;
455cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 0);
456cabdff1aSopenharmony_ci            src += img_width;
457cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 1);
458cabdff1aSopenharmony_ci            src += img_width;
459cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 2);
460cabdff1aSopenharmony_ci            src += img_width;
461cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 3);
462cabdff1aSopenharmony_ci            src += img_width;
463cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 4);
464cabdff1aSopenharmony_ci            src += img_width;
465cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 5);
466cabdff1aSopenharmony_ci            src += img_width;
467cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 6);
468cabdff1aSopenharmony_ci            src += img_width;
469cabdff1aSopenharmony_ci            __lasx_xvstelm_h(p0_org, src, 0, 7);
470cabdff1aSopenharmony_ci        }
471cabdff1aSopenharmony_ci    }
472cabdff1aSopenharmony_ci}
473cabdff1aSopenharmony_ci
474cabdff1aSopenharmony_civoid ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
475cabdff1aSopenharmony_ci                                 int alpha_in, int beta_in, int8_t *tc)
476cabdff1aSopenharmony_ci{
477cabdff1aSopenharmony_ci    int img_width_2x = img_width << 1;
478cabdff1aSopenharmony_ci    __m256i tmp_vec0, bs_vec;
479cabdff1aSopenharmony_ci    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
480cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
481cabdff1aSopenharmony_ci
482cabdff1aSopenharmony_ci    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
483cabdff1aSopenharmony_ci    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
484cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
485cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
486cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
487cabdff1aSopenharmony_ci    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);
488cabdff1aSopenharmony_ci
489cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(bs_vec)) {
490cabdff1aSopenharmony_ci        __m256i p1_org, p0_org, q0_org, q1_org;
491cabdff1aSopenharmony_ci        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
492cabdff1aSopenharmony_ci        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
493cabdff1aSopenharmony_ci        __m256i is_bs_greater_than0;
494cabdff1aSopenharmony_ci
495cabdff1aSopenharmony_ci        alpha = __lasx_xvreplgr2vr_b(alpha_in);
496cabdff1aSopenharmony_ci        beta  = __lasx_xvreplgr2vr_b(beta_in);
497cabdff1aSopenharmony_ci
498cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
499cabdff1aSopenharmony_ci                  p1_org, p0_org);
500cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
501cabdff1aSopenharmony_ci
502cabdff1aSopenharmony_ci        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
503cabdff1aSopenharmony_ci        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
504cabdff1aSopenharmony_ci        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
505cabdff1aSopenharmony_ci        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
506cabdff1aSopenharmony_ci
507cabdff1aSopenharmony_ci        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
508cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
509cabdff1aSopenharmony_ci        is_less_than       = is_less_than_alpha & is_less_than_beta;
510cabdff1aSopenharmony_ci        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
511cabdff1aSopenharmony_ci        is_less_than       = is_less_than_beta & is_less_than;
512cabdff1aSopenharmony_ci        is_less_than       = is_less_than & is_bs_greater_than0;
513cabdff1aSopenharmony_ci
514cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than)) {
515cabdff1aSopenharmony_ci            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
516cabdff1aSopenharmony_ci
517cabdff1aSopenharmony_ci            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
518cabdff1aSopenharmony_ci            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
519cabdff1aSopenharmony_ci            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
520cabdff1aSopenharmony_ci            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
521cabdff1aSopenharmony_ci
522cabdff1aSopenharmony_ci            {
523cabdff1aSopenharmony_ci                __m256i neg_thresh_h, tc_h, p0_h, q0_h;
524cabdff1aSopenharmony_ci
525cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_xvneg_b(tc_vec);
526cabdff1aSopenharmony_ci                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
527cabdff1aSopenharmony_ci                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
528cabdff1aSopenharmony_ci
529cabdff1aSopenharmony_ci                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
530cabdff1aSopenharmony_ci                             neg_thresh_h, tc_h, p0_h, q0_h);
531cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
532cabdff1aSopenharmony_ci                          p0_h, q0_h);
533cabdff1aSopenharmony_ci                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
534cabdff1aSopenharmony_ci                          p0_h, q0_h);
535cabdff1aSopenharmony_ci                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
536cabdff1aSopenharmony_ci                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
537cabdff1aSopenharmony_ci                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
538cabdff1aSopenharmony_ci                __lasx_xvstelm_d(q0_h, data, 0, 0);
539cabdff1aSopenharmony_ci            }
540cabdff1aSopenharmony_ci        }
541cabdff1aSopenharmony_ci    }
542cabdff1aSopenharmony_ci}
543cabdff1aSopenharmony_ci
/*
 * Three-tap-deep filter for one side of an edge, operating on vectors of
 * 16-bit lanes. It is used by the luma intra filters in this file, once for
 * the p side and once (with p and q swapped) for the q side.
 *
 * Written for the p side, the results are
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 * The "+ 4" / "+ 2" terms come from __lasx_xvsrar_h, which rounds while
 * shifting.
 *
 * Inputs:
 *   p3_or_q3_org_in  outermost sample of the side being filtered
 *   p0_or_q0_org_in  sample next to the edge on the side being filtered
 *   q3_or_p3_org_in  despite its name, the callers in this file pass the
 *                    sample next to the edge on the OPPOSITE side (q0 when
 *                    filtering p, p0 when filtering q)
 *   p1_or_q1_org_in  second sample of the side being filtered
 *   p2_or_q2_org_in  third sample of the side being filtered
 *   q1_or_p1_org_in  second sample of the opposite side
 * Outputs (assigned, must be lvalues):
 *   p0_or_q0_out, p1_or_q1_out, p2_or_q2_out
 *
 * The macro declares locals named threshold, const2 and const3; arguments
 * must not use those names.
 */
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    __m256i threshold;                                                      \
    __m256i const2, const3 = __lasx_xvldi(0);                               \
                                                                            \
    /* Build the constants 2 and 3 in every 16-bit lane. */                 \
    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
    /* threshold = p0 + q0 + p1, shared by all three outputs. */            \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
                                                                            \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
                                                                            \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
                                                                            \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
}
571cabdff1aSopenharmony_ci
/*
 * Single-sample filter for the pixel next to the edge, on vectors of
 * 16-bit lanes:
 *   out = (2 * p1 + p0 + q1 + 2) >> 2
 * e.g. data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);
 * The "+ 2" is supplied by the rounding shift __lasx_xvsrar_h.
 *
 * Inputs:
 *   p0_or_q0_org_in  sample next to the edge on the side being filtered
 *   q1_or_p1_org_in  second sample of the opposite side
 *   p1_or_q1_org_in  second sample of the side being filtered (weight 2)
 * Output (assigned, must be an lvalue):
 *   p0_or_q0_out
 *
 * The macro declares a local named const2; arguments must not use that
 * name.
 */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,             \
                         p1_or_q1_org_in, p0_or_q0_out)                \
{                                                                      \
    __m256i const2 = __lasx_xvldi(0);                                  \
    /* Constant 2 in every 16-bit lane: the shift amount. */           \
    const2 = __lasx_xvaddi_hu(const2, 2);                              \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);              \
}
583cabdff1aSopenharmony_ci
584cabdff1aSopenharmony_civoid ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
585cabdff1aSopenharmony_ci                                     int alpha_in, int beta_in)
586cabdff1aSopenharmony_ci{
587cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
588cabdff1aSopenharmony_ci    ptrdiff_t img_width_4x = img_width << 2;
589cabdff1aSopenharmony_ci    ptrdiff_t img_width_3x = img_width_2x + img_width;
590cabdff1aSopenharmony_ci    uint8_t *src = data - 4;
591cabdff1aSopenharmony_ci    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
592cabdff1aSopenharmony_ci    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
593cabdff1aSopenharmony_ci    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
594cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
595cabdff1aSopenharmony_ci
596cabdff1aSopenharmony_ci    {
597cabdff1aSopenharmony_ci        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
598cabdff1aSopenharmony_ci        __m256i row8, row9, row10, row11, row12, row13, row14, row15;
599cabdff1aSopenharmony_ci
600cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
601cabdff1aSopenharmony_ci                  src, img_width_3x, row0, row1, row2, row3);
602cabdff1aSopenharmony_ci        src += img_width_4x;
603cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
604cabdff1aSopenharmony_ci                  src, img_width_3x, row4, row5, row6, row7);
605cabdff1aSopenharmony_ci        src += img_width_4x;
606cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
607cabdff1aSopenharmony_ci                  src, img_width_3x, row8, row9, row10, row11);
608cabdff1aSopenharmony_ci        src += img_width_4x;
609cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
610cabdff1aSopenharmony_ci                  src, img_width_3x, row12, row13, row14, row15);
611cabdff1aSopenharmony_ci        src += img_width_4x;
612cabdff1aSopenharmony_ci
613cabdff1aSopenharmony_ci        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
614cabdff1aSopenharmony_ci                             row4, row5, row6, row7,
615cabdff1aSopenharmony_ci                             row8, row9, row10, row11,
616cabdff1aSopenharmony_ci                             row12, row13, row14, row15,
617cabdff1aSopenharmony_ci                             p3_org, p2_org, p1_org, p0_org,
618cabdff1aSopenharmony_ci                             q0_org, q1_org, q2_org, q3_org);
619cabdff1aSopenharmony_ci    }
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_ci    alpha = __lasx_xvreplgr2vr_b(alpha_in);
622cabdff1aSopenharmony_ci    beta  = __lasx_xvreplgr2vr_b(beta_in);
623cabdff1aSopenharmony_ci    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
624cabdff1aSopenharmony_ci    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
625cabdff1aSopenharmony_ci    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
626cabdff1aSopenharmony_ci
627cabdff1aSopenharmony_ci    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
628cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
629cabdff1aSopenharmony_ci    is_less_than       = is_less_than_beta & is_less_than_alpha;
630cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
631cabdff1aSopenharmony_ci    is_less_than       = is_less_than_beta & is_less_than;
632cabdff1aSopenharmony_ci    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);
633cabdff1aSopenharmony_ci
634cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(is_less_than)) {
635cabdff1aSopenharmony_ci        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
636cabdff1aSopenharmony_ci        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
637cabdff1aSopenharmony_ci        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
638cabdff1aSopenharmony_ci
639cabdff1aSopenharmony_ci        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
640cabdff1aSopenharmony_ci        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
641cabdff1aSopenharmony_ci                                                 less_alpha_shift2_add2);
642cabdff1aSopenharmony_ci
643cabdff1aSopenharmony_ci        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
644cabdff1aSopenharmony_ci        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
645cabdff1aSopenharmony_ci        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
646cabdff1aSopenharmony_ci        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
647cabdff1aSopenharmony_ci
648cabdff1aSopenharmony_ci        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
649cabdff1aSopenharmony_ci        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
650cabdff1aSopenharmony_ci        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
651cabdff1aSopenharmony_ci        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
652cabdff1aSopenharmony_ci        is_less_than_beta        = is_less_than_beta & is_less_than;
653cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
654cabdff1aSopenharmony_ci
655cabdff1aSopenharmony_ci        /* combine and store */
656cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than_beta)) {
657cabdff1aSopenharmony_ci            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
658cabdff1aSopenharmony_ci
659cabdff1aSopenharmony_ci            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
660cabdff1aSopenharmony_ci            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);
661cabdff1aSopenharmony_ci
662cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
663cabdff1aSopenharmony_ci                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
664cabdff1aSopenharmony_ci
665cabdff1aSopenharmony_ci            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
666cabdff1aSopenharmony_ci            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
667cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
668cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
669cabdff1aSopenharmony_ci            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
670cabdff1aSopenharmony_ci            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
671cabdff1aSopenharmony_ci            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
672cabdff1aSopenharmony_ci        }
673cabdff1aSopenharmony_ci
674cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
675cabdff1aSopenharmony_ci        /* combine */
676cabdff1aSopenharmony_ci        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
677cabdff1aSopenharmony_ci        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
678cabdff1aSopenharmony_ci        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
679cabdff1aSopenharmony_ci
680cabdff1aSopenharmony_ci        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
681cabdff1aSopenharmony_ci        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
682cabdff1aSopenharmony_ci        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
683cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
684cabdff1aSopenharmony_ci        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
685cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & is_less_than;
686cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
687cabdff1aSopenharmony_ci
688cabdff1aSopenharmony_ci        /* combine and store */
689cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than_beta)) {
690cabdff1aSopenharmony_ci            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
693cabdff1aSopenharmony_ci            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);
694cabdff1aSopenharmony_ci
695cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
696cabdff1aSopenharmony_ci                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
697cabdff1aSopenharmony_ci
698cabdff1aSopenharmony_ci            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
699cabdff1aSopenharmony_ci            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
700cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
701cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
702cabdff1aSopenharmony_ci            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
703cabdff1aSopenharmony_ci            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
704cabdff1aSopenharmony_ci            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
705cabdff1aSopenharmony_ci
706cabdff1aSopenharmony_ci        }
707cabdff1aSopenharmony_ci
708cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
709cabdff1aSopenharmony_ci
710cabdff1aSopenharmony_ci        /* combine */
711cabdff1aSopenharmony_ci        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
712cabdff1aSopenharmony_ci        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
713cabdff1aSopenharmony_ci        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
714cabdff1aSopenharmony_ci
715cabdff1aSopenharmony_ci        /* transpose and store */
716cabdff1aSopenharmony_ci        {
717cabdff1aSopenharmony_ci            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
718cabdff1aSopenharmony_ci            __m256i control = {0x0000000400000000, 0x0000000500000001,
719cabdff1aSopenharmony_ci                               0x0000000600000002, 0x0000000700000003};
720cabdff1aSopenharmony_ci
721cabdff1aSopenharmony_ci            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
722cabdff1aSopenharmony_ci                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
723cabdff1aSopenharmony_ci                      p0_org, p1_org, p2_org, p3_org);
724cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
725cabdff1aSopenharmony_ci                      row0, row2);
726cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
727cabdff1aSopenharmony_ci                      row1, row3);
728cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
729cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
730cabdff1aSopenharmony_ci            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
731cabdff1aSopenharmony_ci                      control, row7, control, row4, row5, row6, row7);
732cabdff1aSopenharmony_ci            src = data - 4;
733cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row4, src, 0, 0);
734cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
735cabdff1aSopenharmony_ci            src += img_width_2x;
736cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row4, src, 0, 2);
737cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
738cabdff1aSopenharmony_ci            src += img_width_2x;
739cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row5, src, 0, 0);
740cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
741cabdff1aSopenharmony_ci            src += img_width_2x;
742cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row5, src, 0, 2);
743cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
744cabdff1aSopenharmony_ci            src += img_width_2x;
745cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row6, src, 0, 0);
746cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
747cabdff1aSopenharmony_ci            src += img_width_2x;
748cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row6, src, 0, 2);
749cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
750cabdff1aSopenharmony_ci            src += img_width_2x;
751cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row7, src, 0, 0);
752cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
753cabdff1aSopenharmony_ci            src += img_width_2x;
754cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row7, src, 0, 2);
755cabdff1aSopenharmony_ci            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
756cabdff1aSopenharmony_ci        }
757cabdff1aSopenharmony_ci    }
758cabdff1aSopenharmony_ci}
759cabdff1aSopenharmony_ci
760cabdff1aSopenharmony_civoid ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
761cabdff1aSopenharmony_ci                                     int alpha_in, int beta_in)
762cabdff1aSopenharmony_ci{
763cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
764cabdff1aSopenharmony_ci    ptrdiff_t img_width_3x = img_width_2x + img_width;
765cabdff1aSopenharmony_ci    uint8_t *src = data - img_width_2x;
766cabdff1aSopenharmony_ci    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
767cabdff1aSopenharmony_ci    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
768cabdff1aSopenharmony_ci    __m256i p1_org, p0_org, q0_org, q1_org;
769cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
770cabdff1aSopenharmony_ci
771cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
772cabdff1aSopenharmony_ci              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
773cabdff1aSopenharmony_ci    alpha = __lasx_xvreplgr2vr_b(alpha_in);
774cabdff1aSopenharmony_ci    beta  = __lasx_xvreplgr2vr_b(beta_in);
775cabdff1aSopenharmony_ci    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
776cabdff1aSopenharmony_ci    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
777cabdff1aSopenharmony_ci    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
778cabdff1aSopenharmony_ci
779cabdff1aSopenharmony_ci    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
780cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
781cabdff1aSopenharmony_ci    is_less_than       = is_less_than_beta & is_less_than_alpha;
782cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
783cabdff1aSopenharmony_ci    is_less_than       = is_less_than_beta & is_less_than;
784cabdff1aSopenharmony_ci    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(is_less_than)) {
787cabdff1aSopenharmony_ci        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
788cabdff1aSopenharmony_ci        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
789cabdff1aSopenharmony_ci        __m256i p2_org = __lasx_xvldx(src, -img_width);
790cabdff1aSopenharmony_ci        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
791cabdff1aSopenharmony_ci        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
792cabdff1aSopenharmony_ci        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
793cabdff1aSopenharmony_ci        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
794cabdff1aSopenharmony_ci                                                 less_alpha_shift2_add2);
795cabdff1aSopenharmony_ci
796cabdff1aSopenharmony_ci        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
797cabdff1aSopenharmony_ci        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
798cabdff1aSopenharmony_ci        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
799cabdff1aSopenharmony_ci        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
800cabdff1aSopenharmony_ci
801cabdff1aSopenharmony_ci        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
802cabdff1aSopenharmony_ci        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
803cabdff1aSopenharmony_ci        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
804cabdff1aSopenharmony_ci        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
805cabdff1aSopenharmony_ci        is_less_than_beta        = is_less_than_beta & is_less_than;
806cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
807cabdff1aSopenharmony_ci
808cabdff1aSopenharmony_ci        /* combine and store */
809cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than_beta)) {
810cabdff1aSopenharmony_ci            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
811cabdff1aSopenharmony_ci            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);
812cabdff1aSopenharmony_ci
813cabdff1aSopenharmony_ci            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
814cabdff1aSopenharmony_ci            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);
815cabdff1aSopenharmony_ci
816cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
817cabdff1aSopenharmony_ci                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
818cabdff1aSopenharmony_ci
819cabdff1aSopenharmony_ci            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
820cabdff1aSopenharmony_ci            p0_h =  __lasx_xvpermi_d(p0_h, 0xd8);
821cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
822cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
823cabdff1aSopenharmony_ci            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
824cabdff1aSopenharmony_ci            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
825cabdff1aSopenharmony_ci            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
826cabdff1aSopenharmony_ci
827cabdff1aSopenharmony_ci            __lasx_xvst(p1_org, src, 0);
828cabdff1aSopenharmony_ci            __lasx_xvst(p2_org, src - img_width, 0);
829cabdff1aSopenharmony_ci        }
830cabdff1aSopenharmony_ci
831cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
832cabdff1aSopenharmony_ci        /* combine */
833cabdff1aSopenharmony_ci        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
834cabdff1aSopenharmony_ci        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
835cabdff1aSopenharmony_ci        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
836cabdff1aSopenharmony_ci        __lasx_xvst(p0_org, data - img_width, 0);
837cabdff1aSopenharmony_ci
838cabdff1aSopenharmony_ci        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
839cabdff1aSopenharmony_ci        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
840cabdff1aSopenharmony_ci        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
841cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
842cabdff1aSopenharmony_ci        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
843cabdff1aSopenharmony_ci        is_less_than_beta = is_less_than_beta & is_less_than;
844cabdff1aSopenharmony_ci        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
845cabdff1aSopenharmony_ci
846cabdff1aSopenharmony_ci        /* combine and store */
847cabdff1aSopenharmony_ci        if (__lasx_xbnz_v(is_less_than_beta)) {
848cabdff1aSopenharmony_ci            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
849cabdff1aSopenharmony_ci            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);
850cabdff1aSopenharmony_ci
851cabdff1aSopenharmony_ci            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
852cabdff1aSopenharmony_ci            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);
853cabdff1aSopenharmony_ci
854cabdff1aSopenharmony_ci            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
855cabdff1aSopenharmony_ci                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
856cabdff1aSopenharmony_ci
857cabdff1aSopenharmony_ci            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
858cabdff1aSopenharmony_ci            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
859cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
860cabdff1aSopenharmony_ci            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
861cabdff1aSopenharmony_ci            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
862cabdff1aSopenharmony_ci            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
863cabdff1aSopenharmony_ci            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
864cabdff1aSopenharmony_ci
865cabdff1aSopenharmony_ci            __lasx_xvst(q1_org, data + img_width, 0);
866cabdff1aSopenharmony_ci            __lasx_xvst(q2_org, data + img_width_2x, 0);
867cabdff1aSopenharmony_ci        }
868cabdff1aSopenharmony_ci
869cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
870cabdff1aSopenharmony_ci
871cabdff1aSopenharmony_ci        /* combine */
872cabdff1aSopenharmony_ci        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
873cabdff1aSopenharmony_ci        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
874cabdff1aSopenharmony_ci        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
875cabdff1aSopenharmony_ci
876cabdff1aSopenharmony_ci        __lasx_xvst(q0_org, data, 0);
877cabdff1aSopenharmony_ci    }
878cabdff1aSopenharmony_ci}
879cabdff1aSopenharmony_ci
880cabdff1aSopenharmony_civoid ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
881cabdff1aSopenharmony_ci                                       int alpha_in, int beta_in)
882cabdff1aSopenharmony_ci{
883cabdff1aSopenharmony_ci    uint8_t *src = data - 2;
884cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
885cabdff1aSopenharmony_ci    ptrdiff_t img_width_4x = img_width << 2;
886cabdff1aSopenharmony_ci    ptrdiff_t img_width_3x = img_width_2x + img_width;
887cabdff1aSopenharmony_ci    __m256i p1_org, p0_org, q0_org, q1_org;
888cabdff1aSopenharmony_ci    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
889cabdff1aSopenharmony_ci    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
890cabdff1aSopenharmony_ci
891cabdff1aSopenharmony_ci    {
892cabdff1aSopenharmony_ci        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
895cabdff1aSopenharmony_ci                  img_width_3x, row0, row1, row2, row3);
896cabdff1aSopenharmony_ci        src += img_width_4x;
897cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
898cabdff1aSopenharmony_ci                  img_width_3x, row4, row5, row6, row7);
899cabdff1aSopenharmony_ci
900cabdff1aSopenharmony_ci        /* LASX_TRANSPOSE8x4_B */
901cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
902cabdff1aSopenharmony_ci                  p1_org, p0_org, q0_org, q1_org);
903cabdff1aSopenharmony_ci        row0 = __lasx_xvilvl_b(p0_org, p1_org);
904cabdff1aSopenharmony_ci        row1 = __lasx_xvilvl_b(q1_org, q0_org);
905cabdff1aSopenharmony_ci        row3 = __lasx_xvilvh_w(row1, row0);
906cabdff1aSopenharmony_ci        row2 = __lasx_xvilvl_w(row1, row0);
907cabdff1aSopenharmony_ci        p1_org = __lasx_xvpermi_d(row2, 0x00);
908cabdff1aSopenharmony_ci        p0_org = __lasx_xvpermi_d(row2, 0x55);
909cabdff1aSopenharmony_ci        q0_org = __lasx_xvpermi_d(row3, 0x00);
910cabdff1aSopenharmony_ci        q1_org = __lasx_xvpermi_d(row3, 0x55);
911cabdff1aSopenharmony_ci    }
912cabdff1aSopenharmony_ci
913cabdff1aSopenharmony_ci    alpha = __lasx_xvreplgr2vr_b(alpha_in);
914cabdff1aSopenharmony_ci    beta  = __lasx_xvreplgr2vr_b(beta_in);
915cabdff1aSopenharmony_ci
916cabdff1aSopenharmony_ci    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
917cabdff1aSopenharmony_ci    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
918cabdff1aSopenharmony_ci    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
919cabdff1aSopenharmony_ci
920cabdff1aSopenharmony_ci    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
921cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
922cabdff1aSopenharmony_ci    is_less_than       = is_less_than_alpha & is_less_than_beta;
923cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
924cabdff1aSopenharmony_ci    is_less_than       = is_less_than_beta & is_less_than;
925cabdff1aSopenharmony_ci
926cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(is_less_than)) {
927cabdff1aSopenharmony_ci        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
928cabdff1aSopenharmony_ci
929cabdff1aSopenharmony_ci        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
930cabdff1aSopenharmony_ci        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
931cabdff1aSopenharmony_ci        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
932cabdff1aSopenharmony_ci        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
933cabdff1aSopenharmony_ci
934cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
935cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
936cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
937cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
938cabdff1aSopenharmony_ci        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
939cabdff1aSopenharmony_ci        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
940cabdff1aSopenharmony_ci    }
941cabdff1aSopenharmony_ci    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
942cabdff1aSopenharmony_ci    src = data - 1;
943cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 0);
944cabdff1aSopenharmony_ci    src += img_width;
945cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 1);
946cabdff1aSopenharmony_ci    src += img_width;
947cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 2);
948cabdff1aSopenharmony_ci    src += img_width;
949cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 3);
950cabdff1aSopenharmony_ci    src += img_width;
951cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 4);
952cabdff1aSopenharmony_ci    src += img_width;
953cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 5);
954cabdff1aSopenharmony_ci    src += img_width;
955cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 6);
956cabdff1aSopenharmony_ci    src += img_width;
957cabdff1aSopenharmony_ci    __lasx_xvstelm_h(p0_org, src, 0, 7);
958cabdff1aSopenharmony_ci}
959cabdff1aSopenharmony_ci
960cabdff1aSopenharmony_civoid ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
961cabdff1aSopenharmony_ci                                       int alpha_in, int beta_in)
962cabdff1aSopenharmony_ci{
963cabdff1aSopenharmony_ci    ptrdiff_t img_width_2x = img_width << 1;
964cabdff1aSopenharmony_ci    __m256i p1_org, p0_org, q0_org, q1_org;
965cabdff1aSopenharmony_ci    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
966cabdff1aSopenharmony_ci    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
967cabdff1aSopenharmony_ci
968cabdff1aSopenharmony_ci    alpha = __lasx_xvreplgr2vr_b(alpha_in);
969cabdff1aSopenharmony_ci    beta  = __lasx_xvreplgr2vr_b(beta_in);
970cabdff1aSopenharmony_ci
971cabdff1aSopenharmony_ci    p1_org = __lasx_xvldx(data, -img_width_2x);
972cabdff1aSopenharmony_ci    p0_org = __lasx_xvldx(data, -img_width);
973cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
974cabdff1aSopenharmony_ci
975cabdff1aSopenharmony_ci    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
976cabdff1aSopenharmony_ci    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
977cabdff1aSopenharmony_ci    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
978cabdff1aSopenharmony_ci
979cabdff1aSopenharmony_ci    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
980cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
981cabdff1aSopenharmony_ci    is_less_than       = is_less_than_alpha & is_less_than_beta;
982cabdff1aSopenharmony_ci    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
983cabdff1aSopenharmony_ci    is_less_than       = is_less_than_beta & is_less_than;
984cabdff1aSopenharmony_ci
985cabdff1aSopenharmony_ci    if (__lasx_xbnz_v(is_less_than)) {
986cabdff1aSopenharmony_ci        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
987cabdff1aSopenharmony_ci
988cabdff1aSopenharmony_ci        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
989cabdff1aSopenharmony_ci        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
990cabdff1aSopenharmony_ci        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
991cabdff1aSopenharmony_ci        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
994cabdff1aSopenharmony_ci        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
995cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
996cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
997cabdff1aSopenharmony_ci        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
998cabdff1aSopenharmony_ci        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
999cabdff1aSopenharmony_ci        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
1000cabdff1aSopenharmony_ci        __lasx_xvstelm_d(q0_h, data, 0, 0);
1001cabdff1aSopenharmony_ci    }
1002cabdff1aSopenharmony_ci}
1003cabdff1aSopenharmony_ci
1004cabdff1aSopenharmony_civoid ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
1005cabdff1aSopenharmony_ci                                      ptrdiff_t stride, int height,
1006cabdff1aSopenharmony_ci                                      int log2_denom, int weight_dst,
1007cabdff1aSopenharmony_ci                                      int weight_src, int offset_in)
1008cabdff1aSopenharmony_ci{
1009cabdff1aSopenharmony_ci    __m256i wgt;
1010cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
1011cabdff1aSopenharmony_ci    __m256i dst0, dst1, dst2, dst3;
1012cabdff1aSopenharmony_ci    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1013cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1014cabdff1aSopenharmony_ci    __m256i denom, offset;
1015cabdff1aSopenharmony_ci    int stride_2x = stride << 1;
1016cabdff1aSopenharmony_ci    int stride_4x = stride << 2;
1017cabdff1aSopenharmony_ci    int stride_3x = stride_2x + stride;
1018cabdff1aSopenharmony_ci
1019cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1020cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1021cabdff1aSopenharmony_ci    log2_denom += 1;
1022cabdff1aSopenharmony_ci
1023cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1024cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1025cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1026cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1027cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1028cabdff1aSopenharmony_ci
1029cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1030cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1031cabdff1aSopenharmony_ci    src += stride_4x;
1032cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1033cabdff1aSopenharmony_ci              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1034cabdff1aSopenharmony_ci    src += stride_4x;
1035cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1036cabdff1aSopenharmony_ci              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1037cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1038cabdff1aSopenharmony_ci              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1039cabdff1aSopenharmony_ci    dst += stride_4x;
1040cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1041cabdff1aSopenharmony_ci              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1042cabdff1aSopenharmony_ci    dst -= stride_4x;
1043cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1044cabdff1aSopenharmony_ci              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1045cabdff1aSopenharmony_ci
1046cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1047cabdff1aSopenharmony_ci              src0, src1, src2, src3);
1048cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1049cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
1050cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1051cabdff1aSopenharmony_ci              dst3, src3, vec0, vec2, vec4, vec6);
1052cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1053cabdff1aSopenharmony_ci              dst3, src3, vec1, vec3, vec5, vec7);
1054cabdff1aSopenharmony_ci
1055cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1056cabdff1aSopenharmony_ci              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1057cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1058cabdff1aSopenharmony_ci              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1059cabdff1aSopenharmony_ci
1060cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1061cabdff1aSopenharmony_ci    tmp1 = __lasx_xvsra_h(tmp1, denom);
1062cabdff1aSopenharmony_ci    tmp2 = __lasx_xvsra_h(tmp2, denom);
1063cabdff1aSopenharmony_ci    tmp3 = __lasx_xvsra_h(tmp3, denom);
1064cabdff1aSopenharmony_ci    tmp4 = __lasx_xvsra_h(tmp4, denom);
1065cabdff1aSopenharmony_ci    tmp5 = __lasx_xvsra_h(tmp5, denom);
1066cabdff1aSopenharmony_ci    tmp6 = __lasx_xvsra_h(tmp6, denom);
1067cabdff1aSopenharmony_ci    tmp7 = __lasx_xvsra_h(tmp7, denom);
1068cabdff1aSopenharmony_ci
1069cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1070cabdff1aSopenharmony_ci                                  tmp0, tmp1, tmp2, tmp3);
1071cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1072cabdff1aSopenharmony_ci                                  tmp4, tmp5, tmp6, tmp7);
1073cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1074cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
1075cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1076cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 8, 1);
1077cabdff1aSopenharmony_ci    dst += stride;
1078cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 2);
1079cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 8, 3);
1080cabdff1aSopenharmony_ci    dst += stride;
1081cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 0, 0);
1082cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 8, 1);
1083cabdff1aSopenharmony_ci    dst += stride;
1084cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 0, 2);
1085cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 8, 3);
1086cabdff1aSopenharmony_ci    dst += stride;
1087cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst, 0, 0);
1088cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst, 8, 1);
1089cabdff1aSopenharmony_ci    dst += stride;
1090cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst, 0, 2);
1091cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst, 8, 3);
1092cabdff1aSopenharmony_ci    dst += stride;
1093cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst, 0, 0);
1094cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst, 8, 1);
1095cabdff1aSopenharmony_ci    dst += stride;
1096cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst, 0, 2);
1097cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst, 8, 3);
1098cabdff1aSopenharmony_ci    dst += stride;
1099cabdff1aSopenharmony_ci
1100cabdff1aSopenharmony_ci    if (16 == height) {
1101cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1102cabdff1aSopenharmony_ci                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1103cabdff1aSopenharmony_ci        src += stride_4x;
1104cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1105cabdff1aSopenharmony_ci                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1106cabdff1aSopenharmony_ci        src += stride_4x;
1107cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1108cabdff1aSopenharmony_ci                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1109cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1110cabdff1aSopenharmony_ci                  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1111cabdff1aSopenharmony_ci        dst += stride_4x;
1112cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1113cabdff1aSopenharmony_ci                  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1114cabdff1aSopenharmony_ci        dst -= stride_4x;
1115cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1116cabdff1aSopenharmony_ci                  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1117cabdff1aSopenharmony_ci
1118cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1119cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
1120cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1121cabdff1aSopenharmony_ci                  dst0, dst1, dst2, dst3);
1122cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1123cabdff1aSopenharmony_ci                  dst3, src3, vec0, vec2, vec4, vec6);
1124cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1125cabdff1aSopenharmony_ci                  dst3, src3, vec1, vec3, vec5, vec7);
1126cabdff1aSopenharmony_ci
1127cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1128cabdff1aSopenharmony_ci                  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1129cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1130cabdff1aSopenharmony_ci                  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1131cabdff1aSopenharmony_ci
1132cabdff1aSopenharmony_ci        tmp0 = __lasx_xvsra_h(tmp0, denom);
1133cabdff1aSopenharmony_ci        tmp1 = __lasx_xvsra_h(tmp1, denom);
1134cabdff1aSopenharmony_ci        tmp2 = __lasx_xvsra_h(tmp2, denom);
1135cabdff1aSopenharmony_ci        tmp3 = __lasx_xvsra_h(tmp3, denom);
1136cabdff1aSopenharmony_ci        tmp4 = __lasx_xvsra_h(tmp4, denom);
1137cabdff1aSopenharmony_ci        tmp5 = __lasx_xvsra_h(tmp5, denom);
1138cabdff1aSopenharmony_ci        tmp6 = __lasx_xvsra_h(tmp6, denom);
1139cabdff1aSopenharmony_ci        tmp7 = __lasx_xvsra_h(tmp7, denom);
1140cabdff1aSopenharmony_ci
1141cabdff1aSopenharmony_ci        DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1142cabdff1aSopenharmony_ci                                      tmp0, tmp1, tmp2, tmp3);
1143cabdff1aSopenharmony_ci        DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1144cabdff1aSopenharmony_ci                                      tmp4, tmp5, tmp6, tmp7);
1145cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
1146cabdff1aSopenharmony_ci                  tmp6, dst0, dst1, dst2, dst3);
1147cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst0, dst, 0, 0);
1148cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst0, dst, 8, 1);
1149cabdff1aSopenharmony_ci        dst += stride;
1150cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst0, dst, 0, 2);
1151cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst0, dst, 8, 3);
1152cabdff1aSopenharmony_ci        dst += stride;
1153cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst1, dst, 0, 0);
1154cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst1, dst, 8, 1);
1155cabdff1aSopenharmony_ci        dst += stride;
1156cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst1, dst, 0, 2);
1157cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst1, dst, 8, 3);
1158cabdff1aSopenharmony_ci        dst += stride;
1159cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst2, dst, 0, 0);
1160cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst2, dst, 8, 1);
1161cabdff1aSopenharmony_ci        dst += stride;
1162cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst2, dst, 0, 2);
1163cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst2, dst, 8, 3);
1164cabdff1aSopenharmony_ci        dst += stride;
1165cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst3, dst, 0, 0);
1166cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst3, dst, 8, 1);
1167cabdff1aSopenharmony_ci        dst += stride;
1168cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst3, dst, 0, 2);
1169cabdff1aSopenharmony_ci        __lasx_xvstelm_d(dst3, dst, 8, 3);
1170cabdff1aSopenharmony_ci    }
1171cabdff1aSopenharmony_ci}
1172cabdff1aSopenharmony_ci
1173cabdff1aSopenharmony_cistatic void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1174cabdff1aSopenharmony_ci                               int32_t log2_denom, int32_t weight_src,
1175cabdff1aSopenharmony_ci                               int32_t weight_dst, int32_t offset_in)
1176cabdff1aSopenharmony_ci{
1177cabdff1aSopenharmony_ci    __m256i wgt, vec0, vec1;
1178cabdff1aSopenharmony_ci    __m256i src0, dst0;
1179cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1180cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1181cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1182cabdff1aSopenharmony_ci
1183cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1184cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1185cabdff1aSopenharmony_ci    log2_denom += 1;
1186cabdff1aSopenharmony_ci
1187cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1188cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1189cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1190cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1191cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1192cabdff1aSopenharmony_ci
1193cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1194cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1195cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1196cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1197cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1198cabdff1aSopenharmony_ci              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1199cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1200cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1201cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1202cabdff1aSopenharmony_ci    vec0 = __lasx_xvilvl_b(dst0, src0);
1203cabdff1aSopenharmony_ci    vec1 = __lasx_xvilvh_b(dst0, src0);
1204cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1205cabdff1aSopenharmony_ci              tmp0, tmp1);
1206cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1207cabdff1aSopenharmony_ci    tmp1 = __lasx_xvsra_h(tmp1, denom);
1208cabdff1aSopenharmony_ci    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1209cabdff1aSopenharmony_ci    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
1210cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1211cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1212cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1213cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1214cabdff1aSopenharmony_ci}
1215cabdff1aSopenharmony_ci
1216cabdff1aSopenharmony_cistatic void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1217cabdff1aSopenharmony_ci                               int32_t log2_denom, int32_t weight_src,
1218cabdff1aSopenharmony_ci                               int32_t weight_dst, int32_t offset_in)
1219cabdff1aSopenharmony_ci{
1220cabdff1aSopenharmony_ci    __m256i wgt, vec0, vec1, vec2, vec3;
1221cabdff1aSopenharmony_ci    __m256i src0, src1, dst0, dst1;
1222cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1223cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1224cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1225cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1226cabdff1aSopenharmony_ci    uint8_t* dst_tmp = dst;
1227cabdff1aSopenharmony_ci
1228cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1229cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1230cabdff1aSopenharmony_ci    log2_denom += 1;
1231cabdff1aSopenharmony_ci
1232cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1233cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1234cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1235cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1236cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1237cabdff1aSopenharmony_ci
1238cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1239cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1240cabdff1aSopenharmony_ci    src += stride_4x;
1241cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1242cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1243cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1244cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1245cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1246cabdff1aSopenharmony_ci    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1247cabdff1aSopenharmony_ci    tmp0 = __lasx_xvld(dst_tmp, 0);
1248cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
1249cabdff1aSopenharmony_ci    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
1250cabdff1aSopenharmony_ci    dst_tmp += stride_4x;
1251cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1252cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1253cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1254cabdff1aSopenharmony_ci              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1255cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1256cabdff1aSopenharmony_ci    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1257cabdff1aSopenharmony_ci
1258cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
1259cabdff1aSopenharmony_ci              src0, src1, dst0, dst1);
1260cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
1261cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
1262cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1263cabdff1aSopenharmony_ci              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1264cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1265cabdff1aSopenharmony_ci    tmp1 = __lasx_xvsra_h(tmp1, denom);
1266cabdff1aSopenharmony_ci    tmp2 = __lasx_xvsra_h(tmp2, denom);
1267cabdff1aSopenharmony_ci    tmp3 = __lasx_xvsra_h(tmp3, denom);
1268cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1269cabdff1aSopenharmony_ci                                  tmp0, tmp1, tmp2, tmp3);
1270cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
1271cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1272cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1273cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1274cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1275cabdff1aSopenharmony_ci    dst += stride_4x;
1276cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 0, 0);
1277cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1278cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1279cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1280cabdff1aSopenharmony_ci}
1281cabdff1aSopenharmony_ci
1282cabdff1aSopenharmony_cistatic void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1283cabdff1aSopenharmony_ci                                int32_t log2_denom, int32_t weight_src,
1284cabdff1aSopenharmony_ci                                int32_t weight_dst, int32_t offset_in)
1285cabdff1aSopenharmony_ci{
1286cabdff1aSopenharmony_ci    __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1287cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1288cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1289cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1290cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1291cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1292cabdff1aSopenharmony_ci    uint8_t* dst_tmp = dst;
1293cabdff1aSopenharmony_ci
1294cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1295cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1296cabdff1aSopenharmony_ci    log2_denom += 1;
1297cabdff1aSopenharmony_ci
1298cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1299cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1300cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1301cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1302cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1303cabdff1aSopenharmony_ci
1304cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1305cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1306cabdff1aSopenharmony_ci    src += stride_4x;
1307cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1308cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1309cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1310cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1311cabdff1aSopenharmony_ci    src += stride_4x;
1312cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1313cabdff1aSopenharmony_ci    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1314cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1315cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1316cabdff1aSopenharmony_ci    src += stride_4x;
1317cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1318cabdff1aSopenharmony_ci    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1319cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1320cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1321cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1322cabdff1aSopenharmony_ci    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1323cabdff1aSopenharmony_ci
1324cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1325cabdff1aSopenharmony_ci              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1326cabdff1aSopenharmony_ci    dst_tmp += stride_4x;
1327cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1328cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1329cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1330cabdff1aSopenharmony_ci              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1331cabdff1aSopenharmony_ci    dst_tmp += stride_4x;
1332cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1333cabdff1aSopenharmony_ci    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1334cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1335cabdff1aSopenharmony_ci              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1336cabdff1aSopenharmony_ci    dst_tmp += stride_4x;
1337cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1338cabdff1aSopenharmony_ci    dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1339cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1340cabdff1aSopenharmony_ci              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1341cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1342cabdff1aSopenharmony_ci    dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1343cabdff1aSopenharmony_ci
1344cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1345cabdff1aSopenharmony_ci              src0, src1, src2, src3);
1346cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1347cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
1348cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1349cabdff1aSopenharmony_ci              dst3, src3, vec0, vec2, vec4, vec6);
1350cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1351cabdff1aSopenharmony_ci              dst3, src3, vec1, vec3, vec5, vec7);
1352cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1353cabdff1aSopenharmony_ci              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1354cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5,
1355cabdff1aSopenharmony_ci              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1356cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1357cabdff1aSopenharmony_ci    tmp1 = __lasx_xvsra_h(tmp1, denom);
1358cabdff1aSopenharmony_ci    tmp2 = __lasx_xvsra_h(tmp2, denom);
1359cabdff1aSopenharmony_ci    tmp3 = __lasx_xvsra_h(tmp3, denom);
1360cabdff1aSopenharmony_ci    tmp4 = __lasx_xvsra_h(tmp4, denom);
1361cabdff1aSopenharmony_ci    tmp5 = __lasx_xvsra_h(tmp5, denom);
1362cabdff1aSopenharmony_ci    tmp6 = __lasx_xvsra_h(tmp6, denom);
1363cabdff1aSopenharmony_ci    tmp7 = __lasx_xvsra_h(tmp7, denom);
1364cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1365cabdff1aSopenharmony_ci                                  tmp0, tmp1, tmp2, tmp3);
1366cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1367cabdff1aSopenharmony_ci                                  tmp4, tmp5, tmp6, tmp7);
1368cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1369cabdff1aSopenharmony_ci                   dst0, dst1, dst2, dst3)
1370cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1371cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1372cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1373cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1374cabdff1aSopenharmony_ci    dst += stride_4x;
1375cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst, 0, 0);
1376cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1377cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1378cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1379cabdff1aSopenharmony_ci    dst += stride_4x;
1380cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst, 0, 0);
1381cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
1382cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
1383cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
1384cabdff1aSopenharmony_ci    dst += stride_4x;
1385cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst, 0, 0);
1386cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
1387cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
1388cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
1389cabdff1aSopenharmony_ci}
1390cabdff1aSopenharmony_ci
/*
 * Bi-weighted prediction entry point for blocks 8 pixels wide.
 *
 * Picks the fixed-height kernel from `height`: 4 rows, 8 rows, and for any
 * other value the 16-row kernel.  The kernels take (src, dst) and
 * (weight_src, weight_dst) in the opposite order to this function's own
 * parameter list, so the arguments are swapped at the call sites.
 */
void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    switch (height) {
    case 4:
        avc_biwgt_8x4_lasx(src, dst, stride, log2_denom,
                           weight_src, weight_dst, offset);
        break;
    case 8:
        avc_biwgt_8x8_lasx(src, dst, stride, log2_denom,
                           weight_src, weight_dst, offset);
        break;
    default:
        /* Every remaining height is handled by the 16-row kernel. */
        avc_biwgt_8x16_lasx(src, dst, stride, log2_denom,
                            weight_src, weight_dst, offset);
        break;
    }
}
1407cabdff1aSopenharmony_ci
/*
 * Bi-weighted prediction kernel for a 4x2 block: two rows are read from
 * src and from dst, blended, and one word element per row is written
 * back to dst.
 *
 * src, dst    - first row of each input; the second row is read at +stride
 * stride      - byte offset between consecutive rows
 * log2_denom  - base shift; the final right shift is by log2_denom + 1
 * weight_src  - weight paired with the src bytes
 * weight_dst  - weight paired with the dst bytes
 * offset_in   - offset, folded into the accumulator start value below
 *
 * NOTE(review): __lasx_xvldx presumably reads a whole 256-bit vector per
 * row although only the low word of each row is kept - confirm callers
 * guarantee the bytes past each row are readable.
 */
1408cabdff1aSopenharmony_cistatic void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1409cabdff1aSopenharmony_ci                               int32_t log2_denom, int32_t weight_src,
1410cabdff1aSopenharmony_ci                               int32_t weight_dst, int32_t offset_in)
1411cabdff1aSopenharmony_ci{
1412cabdff1aSopenharmony_ci    __m256i wgt, vec0;
1413cabdff1aSopenharmony_ci    __m256i src0, dst0;
1414cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, denom, offset;
1415cabdff1aSopenharmony_ci
    /*
     * Accumulator start value: ((offset_in + 1) | 1) << log2_denom, plus
     * 128 * (weight_src + weight_dst).  The second term appears to
     * compensate for the XOR-with-128 applied to the pixel bytes below.
     */
1416cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1417cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1418cabdff1aSopenharmony_ci    log2_denom += 1;
1419cabdff1aSopenharmony_ci
    /* Replicate each weight per byte and interleave the two weight vectors. */
1420cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1421cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1422cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1423cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1424cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1425cabdff1aSopenharmony_ci
    /* Load two rows of src and of dst; merge each pair with xvilvl_w. */
1426cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1427cabdff1aSopenharmony_ci    src0 = __lasx_xvilvl_w(tmp1, tmp0);
1428cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
1429cabdff1aSopenharmony_ci    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    /*
     * XOR every byte with 128, then interleave dst and src bytes in the
     * same (dst, src) argument order that was used to build wgt.
     */
1430cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1431cabdff1aSopenharmony_ci    vec0 = __lasx_xvilvl_b(dst0, src0);
    /*
     * offset + dot-product-accumulate of wgt and vec0, shift right by
     * denom, clip with xvclip255_h, then keep the even bytes.
     */
1432cabdff1aSopenharmony_ci    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1433cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1434cabdff1aSopenharmony_ci    tmp0 = __lasx_xvclip255_h(tmp0);
1435cabdff1aSopenharmony_ci    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    /* Word elements 0 and 1 are the two output rows. */
1436cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst, 0, 0);
1437cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1438cabdff1aSopenharmony_ci}
1439cabdff1aSopenharmony_ci
/*
 * Bi-weighted prediction kernel for a 4x4 block: four rows are read from
 * src and from dst, blended, and one word element per row is written
 * back to dst.
 *
 * src, dst    - first row of each input; rows are read at +0, +stride,
 *               +2*stride and +3*stride
 * stride      - byte offset between consecutive rows
 * log2_denom  - base shift; the final right shift is by log2_denom + 1
 * weight_src  - weight paired with the src bytes
 * weight_dst  - weight paired with the dst bytes
 * offset_in   - offset, folded into the accumulator start value below
 *
 * NOTE(review): __lasx_xvldx presumably reads a whole 256-bit vector per
 * row although only the low word of each row is kept - confirm callers
 * guarantee the bytes past each row are readable.
 */
1440cabdff1aSopenharmony_cistatic void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1441cabdff1aSopenharmony_ci                               int32_t log2_denom, int32_t weight_src,
1442cabdff1aSopenharmony_ci                               int32_t weight_dst, int32_t offset_in)
1443cabdff1aSopenharmony_ci{
1444cabdff1aSopenharmony_ci    __m256i wgt, vec0;
1445cabdff1aSopenharmony_ci    __m256i src0, dst0;
1446cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1447cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1448cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1449cabdff1aSopenharmony_ci
    /*
     * Accumulator start value: ((offset_in + 1) | 1) << log2_denom, plus
     * 128 * (weight_src + weight_dst).  The second term appears to
     * compensate for the XOR-with-128 applied to the pixel bytes below.
     */
1450cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1451cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1452cabdff1aSopenharmony_ci    log2_denom += 1;
1453cabdff1aSopenharmony_ci
    /* Replicate each weight per byte and interleave the two weight vectors. */
1454cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1455cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1456cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1457cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1458cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1459cabdff1aSopenharmony_ci
    /*
     * Load four rows of src, then four rows of dst.  Each set is merged
     * into one vector by two rounds of xvilvl_w (rows 0/2 and 1/3 first,
     * then the two intermediate results).
     */
1460cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1461cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1462cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1463cabdff1aSopenharmony_ci    src0 = __lasx_xvilvl_w(tmp1, tmp0);
1464cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1465cabdff1aSopenharmony_ci              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1466cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1467cabdff1aSopenharmony_ci    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    /*
     * XOR every byte with 128, interleave dst/src bytes (low and high
     * parts separately) and combine both interleaves into vec0 with
     * xvpermi_q.  dst0 is reused as a temporary for the xvilvh_b result.
     */
1468cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1469cabdff1aSopenharmony_ci    vec0 = __lasx_xvilvl_b(dst0, src0);
1470cabdff1aSopenharmony_ci    dst0 = __lasx_xvilvh_b(dst0, src0);
1471cabdff1aSopenharmony_ci    vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
    /*
     * offset + dot-product-accumulate of wgt and vec0, shift right by
     * denom, clip with xvclip255_h, then keep the even bytes.
     */
1472cabdff1aSopenharmony_ci    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1473cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1474cabdff1aSopenharmony_ci    tmp0 = __lasx_xvclip255_h(tmp0);
1475cabdff1aSopenharmony_ci    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    /*
     * Rows 0/1 come from word elements 0/1 and rows 2/3 from word
     * elements 4/5 (presumably the upper 128-bit half filled by the
     * xvpermi_q above - confirm against the LASX reference).
     */
1476cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst, 0, 0);
1477cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1478cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
1479cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
1480cabdff1aSopenharmony_ci}
1481cabdff1aSopenharmony_ci
/*
 * Bi-weighted prediction kernel for a 4x8 block: eight rows are read from
 * src and from dst, blended, and one word element per row is written
 * back to dst.
 *
 * src, dst    - first row of each input; rows are read in two groups of
 *               four (at +0 .. +3*stride, then again after +4*stride)
 * stride      - byte offset between consecutive rows
 * log2_denom  - base shift; the final right shift is by log2_denom + 1
 * weight_src  - weight paired with the src bytes
 * weight_dst  - weight paired with the dst bytes
 * offset_in   - offset, folded into the accumulator start value below
 *
 * NOTE(review): __lasx_xvldx presumably reads a whole 256-bit vector per
 * row although only the low word of each row is kept - confirm callers
 * guarantee the bytes past each row are readable.
 */
1482cabdff1aSopenharmony_cistatic void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1483cabdff1aSopenharmony_ci                               int32_t log2_denom, int32_t weight_src,
1484cabdff1aSopenharmony_ci                               int32_t weight_dst, int32_t offset_in)
1485cabdff1aSopenharmony_ci{
1486cabdff1aSopenharmony_ci    __m256i wgt, vec0, vec1;
1487cabdff1aSopenharmony_ci    __m256i src0, dst0;
1488cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1489cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1490cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1491cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1492cabdff1aSopenharmony_ci
    /*
     * Accumulator start value: ((offset_in + 1) | 1) << log2_denom, plus
     * 128 * (weight_src + weight_dst).  The second term appears to
     * compensate for the XOR-with-128 applied to the pixel bytes below.
     */
1493cabdff1aSopenharmony_ci    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1494cabdff1aSopenharmony_ci    offset_in  += ((weight_src + weight_dst) << 7);
1495cabdff1aSopenharmony_ci    log2_denom += 1;
1496cabdff1aSopenharmony_ci
    /* Replicate each weight per byte and interleave the two weight vectors. */
1497cabdff1aSopenharmony_ci    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
1498cabdff1aSopenharmony_ci    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
1499cabdff1aSopenharmony_ci    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
1500cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_in);
1501cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1502cabdff1aSopenharmony_ci
    /*
     * Load eight src rows (two groups of four), merge each group of four
     * with two rounds of xvilvl_w, then join both groups into src0 with
     * xvpermi_q.  src is left advanced by 4*stride; it is not used again.
     */
1503cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1504cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1505cabdff1aSopenharmony_ci    src += stride_4x;
1506cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1507cabdff1aSopenharmony_ci              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1508cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1509cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
1510cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1511cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    /*
     * Same gathering for the eight dst rows.  dst is moved forward for the
     * second group and then restored, because it is needed for the stores.
     */
1512cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1513cabdff1aSopenharmony_ci              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1514cabdff1aSopenharmony_ci    dst += stride_4x;
1515cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1516cabdff1aSopenharmony_ci              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1517cabdff1aSopenharmony_ci    dst -= stride_4x;
1518cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1519cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
1520cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1521cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    /*
     * XOR every byte with 128 and interleave dst/src bytes: vec0 from
     * xvilvl_b, vec1 from xvilvh_b.
     */
1522cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1523cabdff1aSopenharmony_ci    vec0 = __lasx_xvilvl_b(dst0, src0);
1524cabdff1aSopenharmony_ci    vec1 = __lasx_xvilvh_b(dst0, src0);
    /*
     * offset + dot-product-accumulate of wgt with vec0 / vec1, shift right
     * by denom, clip with xvclip255_h, then pack the even bytes of both
     * results into tmp0.
     */
1525cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1526cabdff1aSopenharmony_ci              tmp0, tmp1);
1527cabdff1aSopenharmony_ci    tmp0 = __lasx_xvsra_h(tmp0, denom);
1528cabdff1aSopenharmony_ci    tmp1 = __lasx_xvsra_h(tmp1, denom);
1529cabdff1aSopenharmony_ci    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1530cabdff1aSopenharmony_ci    tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
    /* Word elements 0..3 go to rows 0..3, elements 4..7 to rows 4..7. */
1531cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst, 0, 0);
1532cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1533cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
1534cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
1535cabdff1aSopenharmony_ci    dst += stride_4x;
1536cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst, 0, 4);
1537cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
1538cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
1539cabdff1aSopenharmony_ci    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
1540cabdff1aSopenharmony_ci}
1541cabdff1aSopenharmony_ci
/*
 * Bi-weighted prediction entry point for blocks 4 pixels wide.
 *
 * Picks the fixed-height kernel from `height`: 2 rows, 4 rows, and for any
 * other value the 8-row kernel.  The kernels take (src, dst) and
 * (weight_src, weight_dst) in the opposite order to this function's own
 * parameter list, so the arguments are swapped at the call sites.
 */
void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    switch (height) {
    case 2:
        avc_biwgt_4x2_lasx(src, dst, stride, log2_denom,
                           weight_src, weight_dst, offset);
        break;
    case 4:
        avc_biwgt_4x4_lasx(src, dst, stride, log2_denom,
                           weight_src, weight_dst, offset);
        break;
    default:
        /* Every remaining height is handled by the 8-row kernel. */
        avc_biwgt_4x8_lasx(src, dst, stride, log2_denom,
                           weight_src, weight_dst, offset);
        break;
    }
}
1558cabdff1aSopenharmony_ci
1559cabdff1aSopenharmony_civoid ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
1560cabdff1aSopenharmony_ci                                    int height, int log2_denom,
1561cabdff1aSopenharmony_ci                                    int weight_src, int offset_in)
1562cabdff1aSopenharmony_ci{
1563cabdff1aSopenharmony_ci    uint32_t offset_val;
1564cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1565cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1566cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1567cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
1568cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
1569cabdff1aSopenharmony_ci    __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
1570cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1571cabdff1aSopenharmony_ci    __m256i wgt, denom, offset;
1572cabdff1aSopenharmony_ci
1573cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1574cabdff1aSopenharmony_ci
1575cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(weight_src);
1576cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1577cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1578cabdff1aSopenharmony_ci
1579cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1580cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1581cabdff1aSopenharmony_ci    src += stride_4x;
1582cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1583cabdff1aSopenharmony_ci              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1584cabdff1aSopenharmony_ci    src -= stride_4x;
1585cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1586cabdff1aSopenharmony_ci              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1587cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1588cabdff1aSopenharmony_ci              zero, src3, src0_l, src1_l, src2_l, src3_l);
1589cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1590cabdff1aSopenharmony_ci              zero, src3, src0_h, src1_h, src2_h, src3_h);
1591cabdff1aSopenharmony_ci    src0_l = __lasx_xvmul_h(wgt, src0_l);
1592cabdff1aSopenharmony_ci    src0_h = __lasx_xvmul_h(wgt, src0_h);
1593cabdff1aSopenharmony_ci    src1_l = __lasx_xvmul_h(wgt, src1_l);
1594cabdff1aSopenharmony_ci    src1_h = __lasx_xvmul_h(wgt, src1_h);
1595cabdff1aSopenharmony_ci    src2_l = __lasx_xvmul_h(wgt, src2_l);
1596cabdff1aSopenharmony_ci    src2_h = __lasx_xvmul_h(wgt, src2_h);
1597cabdff1aSopenharmony_ci    src3_l = __lasx_xvmul_h(wgt, src3_l);
1598cabdff1aSopenharmony_ci    src3_h = __lasx_xvmul_h(wgt, src3_h);
1599cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1600cabdff1aSopenharmony_ci              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1601cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1602cabdff1aSopenharmony_ci              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1603cabdff1aSopenharmony_ci    src0_l = __lasx_xvmaxi_h(src0_l, 0);
1604cabdff1aSopenharmony_ci    src0_h = __lasx_xvmaxi_h(src0_h, 0);
1605cabdff1aSopenharmony_ci    src1_l = __lasx_xvmaxi_h(src1_l, 0);
1606cabdff1aSopenharmony_ci    src1_h = __lasx_xvmaxi_h(src1_h, 0);
1607cabdff1aSopenharmony_ci    src2_l = __lasx_xvmaxi_h(src2_l, 0);
1608cabdff1aSopenharmony_ci    src2_h = __lasx_xvmaxi_h(src2_h, 0);
1609cabdff1aSopenharmony_ci    src3_l = __lasx_xvmaxi_h(src3_l, 0);
1610cabdff1aSopenharmony_ci    src3_h = __lasx_xvmaxi_h(src3_h, 0);
1611cabdff1aSopenharmony_ci    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1612cabdff1aSopenharmony_ci    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1613cabdff1aSopenharmony_ci    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1614cabdff1aSopenharmony_ci    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1615cabdff1aSopenharmony_ci    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1616cabdff1aSopenharmony_ci    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1617cabdff1aSopenharmony_ci    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1618cabdff1aSopenharmony_ci    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1619cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0_l, src, 0, 0);
1620cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0_h, src, 8, 0);
1621cabdff1aSopenharmony_ci    src += stride;
1622cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0_l, src, 0, 2);
1623cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0_h, src, 8, 2);
1624cabdff1aSopenharmony_ci    src += stride;
1625cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1_l, src, 0, 0);
1626cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1_h, src, 8, 0);
1627cabdff1aSopenharmony_ci    src += stride;
1628cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1_l, src, 0, 2);
1629cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1_h, src, 8, 2);
1630cabdff1aSopenharmony_ci    src += stride;
1631cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2_l, src, 0, 0);
1632cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2_h, src, 8, 0);
1633cabdff1aSopenharmony_ci    src += stride;
1634cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2_l, src, 0, 2);
1635cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2_h, src, 8, 2);
1636cabdff1aSopenharmony_ci    src += stride;
1637cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3_l, src, 0, 0);
1638cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3_h, src, 8, 0);
1639cabdff1aSopenharmony_ci    src += stride;
1640cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3_l, src, 0, 2);
1641cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3_h, src, 8, 2);
1642cabdff1aSopenharmony_ci    src += stride;
1643cabdff1aSopenharmony_ci
1644cabdff1aSopenharmony_ci    if (16 == height) {
1645cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1646cabdff1aSopenharmony_ci                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1647cabdff1aSopenharmony_ci        src += stride_4x;
1648cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1649cabdff1aSopenharmony_ci                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1650cabdff1aSopenharmony_ci        src -= stride_4x;
1651cabdff1aSopenharmony_ci        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1652cabdff1aSopenharmony_ci                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1653cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1654cabdff1aSopenharmony_ci                  zero, src3, src0_l, src1_l, src2_l, src3_l);
1655cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1656cabdff1aSopenharmony_ci                  zero, src3, src0_h, src1_h, src2_h, src3_h);
1657cabdff1aSopenharmony_ci        src0_l = __lasx_xvmul_h(wgt, src0_l);
1658cabdff1aSopenharmony_ci        src0_h = __lasx_xvmul_h(wgt, src0_h);
1659cabdff1aSopenharmony_ci        src1_l = __lasx_xvmul_h(wgt, src1_l);
1660cabdff1aSopenharmony_ci        src1_h = __lasx_xvmul_h(wgt, src1_h);
1661cabdff1aSopenharmony_ci        src2_l = __lasx_xvmul_h(wgt, src2_l);
1662cabdff1aSopenharmony_ci        src2_h = __lasx_xvmul_h(wgt, src2_h);
1663cabdff1aSopenharmony_ci        src3_l = __lasx_xvmul_h(wgt, src3_l);
1664cabdff1aSopenharmony_ci        src3_h = __lasx_xvmul_h(wgt, src3_h);
1665cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
1666cabdff1aSopenharmony_ci                  offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1667cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
1668cabdff1aSopenharmony_ci                  offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1669cabdff1aSopenharmony_ci        src0_l = __lasx_xvmaxi_h(src0_l, 0);
1670cabdff1aSopenharmony_ci        src0_h = __lasx_xvmaxi_h(src0_h, 0);
1671cabdff1aSopenharmony_ci        src1_l = __lasx_xvmaxi_h(src1_l, 0);
1672cabdff1aSopenharmony_ci        src1_h = __lasx_xvmaxi_h(src1_h, 0);
1673cabdff1aSopenharmony_ci        src2_l = __lasx_xvmaxi_h(src2_l, 0);
1674cabdff1aSopenharmony_ci        src2_h = __lasx_xvmaxi_h(src2_h, 0);
1675cabdff1aSopenharmony_ci        src3_l = __lasx_xvmaxi_h(src3_l, 0);
1676cabdff1aSopenharmony_ci        src3_h = __lasx_xvmaxi_h(src3_h, 0);
1677cabdff1aSopenharmony_ci        src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1678cabdff1aSopenharmony_ci        src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1679cabdff1aSopenharmony_ci        src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1680cabdff1aSopenharmony_ci        src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1681cabdff1aSopenharmony_ci        src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1682cabdff1aSopenharmony_ci        src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1683cabdff1aSopenharmony_ci        src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1684cabdff1aSopenharmony_ci        src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1685cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src0_l, src, 0, 0);
1686cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src0_h, src, 8, 0);
1687cabdff1aSopenharmony_ci        src += stride;
1688cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src0_l, src, 0, 2);
1689cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src0_h, src, 8, 2);
1690cabdff1aSopenharmony_ci        src += stride;
1691cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src1_l, src, 0, 0);
1692cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src1_h, src, 8, 0);
1693cabdff1aSopenharmony_ci        src += stride;
1694cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src1_l, src, 0, 2);
1695cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src1_h, src, 8, 2);
1696cabdff1aSopenharmony_ci        src += stride;
1697cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src2_l, src, 0, 0);
1698cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src2_h, src, 8, 0);
1699cabdff1aSopenharmony_ci        src += stride;
1700cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src2_l, src, 0, 2);
1701cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src2_h, src, 8, 2);
1702cabdff1aSopenharmony_ci        src += stride;
1703cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src3_l, src, 0, 0);
1704cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src3_h, src, 8, 0);
1705cabdff1aSopenharmony_ci        src += stride;
1706cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src3_l, src, 0, 2);
1707cabdff1aSopenharmony_ci        __lasx_xvstelm_d(src3_h, src, 8, 2);
1708cabdff1aSopenharmony_ci    }
1709cabdff1aSopenharmony_ci}
1710cabdff1aSopenharmony_ci
/*
 * In-place weighted prediction kernel for an 8x4 block: four rows are
 * read from src, weighted, and one doubleword element per row is written
 * back to src.
 *
 * src         - first row; rows are read/written at +0, +stride,
 *               +2*stride and +3*stride
 * stride      - byte offset between consecutive rows
 * log2_denom  - shift count used when narrowing the weighted samples
 * weight_src  - weight applied to every sample
 * offset_in   - offset; shifted left by log2_denom before it is added
 *
 * NOTE(review): __lasx_xvldx presumably reads a whole 256-bit vector per
 * row although only the low 8 bytes of each row are kept - confirm
 * callers guarantee the bytes past each row are readable.
 */
1711cabdff1aSopenharmony_cistatic void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
1712cabdff1aSopenharmony_ci                             int32_t log2_denom, int32_t weight_src,
1713cabdff1aSopenharmony_ci                             int32_t offset_in)
1714cabdff1aSopenharmony_ci{
1715cabdff1aSopenharmony_ci    uint32_t offset_val;
1716cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1717cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1718cabdff1aSopenharmony_ci    __m256i wgt, zero = __lasx_xvldi(0);
1719cabdff1aSopenharmony_ci    __m256i src0, src0_h, src0_l;
1720cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1721cabdff1aSopenharmony_ci
    /* The cast makes the left shift an unsigned operation. */
1722cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1723cabdff1aSopenharmony_ci
    /* Replicate weight, offset and shift count across 16-bit lanes. */
1724cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(weight_src);
1725cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1726cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1727cabdff1aSopenharmony_ci
    /*
     * Load four rows, merge them pairwise with xvilvl_d and join the two
     * pairs into src0 with xvpermi_q.
     */
1728cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1729cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1730cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1731cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    /* Interleave the bytes with zero, i.e. spread them over 16-bit lanes. */
1732cabdff1aSopenharmony_ci    src0_l = __lasx_xvilvl_b(zero, src0);
1733cabdff1aSopenharmony_ci    src0_h = __lasx_xvilvh_b(zero, src0);
    /*
     * pixel * weight, add offset (xvsadd_h), clamp below at 0, then shift
     * by denom and narrow with xvssrlrn_bu_h (presumably a rounding,
     * saturating narrowing shift to unsigned bytes - confirm against the
     * LASX reference).
     */
1734cabdff1aSopenharmony_ci    src0_l = __lasx_xvmul_h(wgt, src0_l);
1735cabdff1aSopenharmony_ci    src0_h = __lasx_xvmul_h(wgt, src0_h);
1736cabdff1aSopenharmony_ci    src0_l = __lasx_xvsadd_h(src0_l, offset);
1737cabdff1aSopenharmony_ci    src0_h = __lasx_xvsadd_h(src0_h, offset);
1738cabdff1aSopenharmony_ci    src0_l = __lasx_xvmaxi_h(src0_l, 0);
1739cabdff1aSopenharmony_ci    src0_h = __lasx_xvmaxi_h(src0_h, 0);
1740cabdff1aSopenharmony_ci    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1741cabdff1aSopenharmony_ci    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1742cabdff1aSopenharmony_ci
    /*
     * Gather the narrowed halves into one vector; doubleword elements
     * 0..3 are then stored to rows 0..3.
     */
1743cabdff1aSopenharmony_ci    src0 = __lasx_xvpickev_d(src0_h, src0_l);
1744cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src, 0, 0);
1745cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride, 0, 1);
1746cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1747cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1748cabdff1aSopenharmony_ci}
1749cabdff1aSopenharmony_ci
/*
 * In-place weighted prediction kernel for an 8x8 block: eight rows are
 * read from src in two groups of four, weighted, and one doubleword
 * element per row is written back to src.
 *
 * src         - first row; read through the src_tmp copy and written
 *               through src itself
 * stride      - byte offset between consecutive rows
 * log2_denom  - shift count used when narrowing the weighted samples
 * src_weight  - weight applied to every sample
 * offset_in   - offset; shifted left by log2_denom before it is added
 *
 * NOTE(review): __lasx_xvldx presumably reads a whole 256-bit vector per
 * row although only the low 8 bytes of each row are kept - confirm
 * callers guarantee the bytes past each row are readable.
 */
1750cabdff1aSopenharmony_cistatic void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
1751cabdff1aSopenharmony_ci                             int32_t src_weight, int32_t offset_in)
1752cabdff1aSopenharmony_ci{
1753cabdff1aSopenharmony_ci    __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
1754cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1755cabdff1aSopenharmony_ci    uint32_t offset_val;
    /* Separate read cursor so that src still points at row 0 for the stores. */
1756cabdff1aSopenharmony_ci    uint8_t* src_tmp = src;
1757cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1758cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1759cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1760cabdff1aSopenharmony_ci
    /* The cast makes the left shift an unsigned operation. */
1761cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1762cabdff1aSopenharmony_ci
    /* Replicate weight, offset and shift count across 16-bit lanes. */
1763cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(src_weight);
1764cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1765cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1766cabdff1aSopenharmony_ci
    /* Rows 0-3 -> src0: pairwise xvilvl_d, then xvpermi_q joins the pairs. */
1767cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1768cabdff1aSopenharmony_ci              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1769cabdff1aSopenharmony_ci    src_tmp += stride_4x;
1770cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1771cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    /* Rows 4-7 -> src1, gathered the same way. */
1772cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1773cabdff1aSopenharmony_ci              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1774cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1775cabdff1aSopenharmony_ci    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    /* Interleave the bytes with zero, i.e. spread them over 16-bit lanes. */
1776cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
1777cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
    /*
     * pixel * weight, add offset (xvsadd_h), clamp below at 0, then shift
     * by denom and narrow with xvssrlrn_bu_h (presumably a rounding,
     * saturating narrowing shift to unsigned bytes - confirm against the
     * LASX reference).
     */
1778cabdff1aSopenharmony_ci    src0_l = __lasx_xvmul_h(wgt, src0_l);
1779cabdff1aSopenharmony_ci    src0_h = __lasx_xvmul_h(wgt, src0_h);
1780cabdff1aSopenharmony_ci    src1_l = __lasx_xvmul_h(wgt, src1_l);
1781cabdff1aSopenharmony_ci    src1_h = __lasx_xvmul_h(wgt, src1_h);
1782cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1783cabdff1aSopenharmony_ci              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1784cabdff1aSopenharmony_ci    src0_l = __lasx_xvmaxi_h(src0_l, 0);
1785cabdff1aSopenharmony_ci    src0_h = __lasx_xvmaxi_h(src0_h, 0);
1786cabdff1aSopenharmony_ci    src1_l = __lasx_xvmaxi_h(src1_l, 0);
1787cabdff1aSopenharmony_ci    src1_h = __lasx_xvmaxi_h(src1_h, 0);
1788cabdff1aSopenharmony_ci    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1789cabdff1aSopenharmony_ci    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1790cabdff1aSopenharmony_ci    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1791cabdff1aSopenharmony_ci    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1792cabdff1aSopenharmony_ci
    /*
     * Gather the narrowed halves; doubleword elements 0..3 of src0 are
     * stored to rows 0..3 and those of src1 to rows 4..7.
     */
1793cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
1794cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src, 0, 0);
1795cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride, 0, 1);
1796cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1797cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1798cabdff1aSopenharmony_ci    src += stride_4x;
1799cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src, 0, 0);
1800cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src + stride, 0, 1);
1801cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1802cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1803cabdff1aSopenharmony_ci}
1804cabdff1aSopenharmony_ci
1805cabdff1aSopenharmony_cistatic void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
1806cabdff1aSopenharmony_ci                              int32_t log2_denom, int32_t src_weight,
1807cabdff1aSopenharmony_ci                              int32_t offset_in)
1808cabdff1aSopenharmony_ci{
1809cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
1810cabdff1aSopenharmony_ci    __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
1811cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1812cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
1813cabdff1aSopenharmony_ci    uint32_t offset_val;
1814cabdff1aSopenharmony_ci    uint8_t* src_tmp = src;
1815cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1816cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1817cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1818cabdff1aSopenharmony_ci
1819cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1820cabdff1aSopenharmony_ci
1821cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(src_weight);
1822cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1823cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1824cabdff1aSopenharmony_ci
1825cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1826cabdff1aSopenharmony_ci              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1827cabdff1aSopenharmony_ci    src_tmp += stride_4x;
1828cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1829cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1830cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1831cabdff1aSopenharmony_ci              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1832cabdff1aSopenharmony_ci    src_tmp += stride_4x;
1833cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1834cabdff1aSopenharmony_ci    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1835cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1836cabdff1aSopenharmony_ci              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1837cabdff1aSopenharmony_ci    src_tmp += stride_4x;
1838cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1839cabdff1aSopenharmony_ci    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1840cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1841cabdff1aSopenharmony_ci              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1842cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1843cabdff1aSopenharmony_ci    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1844cabdff1aSopenharmony_ci
1845cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
1846cabdff1aSopenharmony_ci              src0_l, src1_l, src2_l, src3_l);
1847cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
1848cabdff1aSopenharmony_ci              src0_h, src1_h, src2_h, src3_h);
1849cabdff1aSopenharmony_ci    src0_l = __lasx_xvmul_h(wgt, src0_l);
1850cabdff1aSopenharmony_ci    src0_h = __lasx_xvmul_h(wgt, src0_h);
1851cabdff1aSopenharmony_ci    src1_l = __lasx_xvmul_h(wgt, src1_l);
1852cabdff1aSopenharmony_ci    src1_h = __lasx_xvmul_h(wgt, src1_h);
1853cabdff1aSopenharmony_ci    src2_l = __lasx_xvmul_h(wgt, src2_l);
1854cabdff1aSopenharmony_ci    src2_h = __lasx_xvmul_h(wgt, src2_h);
1855cabdff1aSopenharmony_ci    src3_l = __lasx_xvmul_h(wgt, src3_l);
1856cabdff1aSopenharmony_ci    src3_h = __lasx_xvmul_h(wgt, src3_h);
1857cabdff1aSopenharmony_ci
1858cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1859cabdff1aSopenharmony_ci              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1860cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1861cabdff1aSopenharmony_ci              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1862cabdff1aSopenharmony_ci
1863cabdff1aSopenharmony_ci    src0_l = __lasx_xvmaxi_h(src0_l, 0);
1864cabdff1aSopenharmony_ci    src0_h = __lasx_xvmaxi_h(src0_h, 0);
1865cabdff1aSopenharmony_ci    src1_l = __lasx_xvmaxi_h(src1_l, 0);
1866cabdff1aSopenharmony_ci    src1_h = __lasx_xvmaxi_h(src1_h, 0);
1867cabdff1aSopenharmony_ci    src2_l = __lasx_xvmaxi_h(src2_l, 0);
1868cabdff1aSopenharmony_ci    src2_h = __lasx_xvmaxi_h(src2_h, 0);
1869cabdff1aSopenharmony_ci    src3_l = __lasx_xvmaxi_h(src3_l, 0);
1870cabdff1aSopenharmony_ci    src3_h = __lasx_xvmaxi_h(src3_h, 0);
1871cabdff1aSopenharmony_ci    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1872cabdff1aSopenharmony_ci    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1873cabdff1aSopenharmony_ci    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1874cabdff1aSopenharmony_ci    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1875cabdff1aSopenharmony_ci    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1876cabdff1aSopenharmony_ci    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1877cabdff1aSopenharmony_ci    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1878cabdff1aSopenharmony_ci    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1879cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
1880cabdff1aSopenharmony_ci              src3_h, src3_l, src0, src1, src2, src3);
1881cabdff1aSopenharmony_ci
1882cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src, 0, 0);
1883cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride, 0, 1);
1884cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1885cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1886cabdff1aSopenharmony_ci    src += stride_4x;
1887cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src, 0, 0);
1888cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src + stride, 0, 1);
1889cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1890cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1891cabdff1aSopenharmony_ci    src += stride_4x;
1892cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2, src, 0, 0);
1893cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2, src + stride, 0, 1);
1894cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
1895cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
1896cabdff1aSopenharmony_ci    src += stride_4x;
1897cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3, src, 0, 0);
1898cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3, src + stride, 0, 1);
1899cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
1900cabdff1aSopenharmony_ci    __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
1901cabdff1aSopenharmony_ci}
1902cabdff1aSopenharmony_ci
/*
 * Weighted prediction entry point for 8-pixel-wide blocks.
 *
 * Dispatches on height: 4 and 8 have dedicated kernels, every other value
 * is handled by the 16-row kernel.
 */
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    switch (height) {
    case 4:
        avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    case 8:
        avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    default:
        avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    }
}
1915cabdff1aSopenharmony_ci
1916cabdff1aSopenharmony_cistatic void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
1917cabdff1aSopenharmony_ci                             int32_t log2_denom, int32_t weight_src,
1918cabdff1aSopenharmony_ci                             int32_t offset_in)
1919cabdff1aSopenharmony_ci{
1920cabdff1aSopenharmony_ci    uint32_t offset_val;
1921cabdff1aSopenharmony_ci    __m256i wgt, zero = __lasx_xvldi(0);
1922cabdff1aSopenharmony_ci    __m256i src0, tmp0, tmp1, denom, offset;
1923cabdff1aSopenharmony_ci
1924cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1925cabdff1aSopenharmony_ci
1926cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(weight_src);
1927cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1928cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1929cabdff1aSopenharmony_ci
1930cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1931cabdff1aSopenharmony_ci    src0 = __lasx_xvilvl_w(tmp1, tmp0);
1932cabdff1aSopenharmony_ci    src0 = __lasx_xvilvl_b(zero, src0);
1933cabdff1aSopenharmony_ci    src0 = __lasx_xvmul_h(wgt, src0);
1934cabdff1aSopenharmony_ci    src0 = __lasx_xvsadd_h(src0, offset);
1935cabdff1aSopenharmony_ci    src0 = __lasx_xvmaxi_h(src0, 0);
1936cabdff1aSopenharmony_ci    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1937cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0, src, 0, 0);
1938cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0, src + stride, 0, 1);
1939cabdff1aSopenharmony_ci}
1940cabdff1aSopenharmony_ci
1941cabdff1aSopenharmony_cistatic void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
1942cabdff1aSopenharmony_ci                             int32_t log2_denom, int32_t weight_src,
1943cabdff1aSopenharmony_ci                             int32_t offset_in)
1944cabdff1aSopenharmony_ci{
1945cabdff1aSopenharmony_ci    __m256i wgt;
1946cabdff1aSopenharmony_ci    __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
1947cabdff1aSopenharmony_ci    uint32_t offset_val;
1948cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1949cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1950cabdff1aSopenharmony_ci
1951cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1952cabdff1aSopenharmony_ci
1953cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(weight_src);
1954cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1955cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1956cabdff1aSopenharmony_ci
1957cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1958cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1959cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1960cabdff1aSopenharmony_ci    src0 = __lasx_xvilvl_w(tmp1, tmp0);
1961cabdff1aSopenharmony_ci    src0 = __lasx_vext2xv_hu_bu(src0);
1962cabdff1aSopenharmony_ci    src0 = __lasx_xvmul_h(wgt, src0);
1963cabdff1aSopenharmony_ci    src0 = __lasx_xvsadd_h(src0, offset);
1964cabdff1aSopenharmony_ci    src0 = __lasx_xvmaxi_h(src0, 0);
1965cabdff1aSopenharmony_ci    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1966cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0, src, 0, 0);
1967cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0, src + stride, 0, 1);
1968cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
1969cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
1970cabdff1aSopenharmony_ci}
1971cabdff1aSopenharmony_ci
1972cabdff1aSopenharmony_cistatic void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
1973cabdff1aSopenharmony_ci                             int32_t log2_denom, int32_t weight_src,
1974cabdff1aSopenharmony_ci                             int32_t offset_in)
1975cabdff1aSopenharmony_ci{
1976cabdff1aSopenharmony_ci    __m256i src0, src0_h, src0_l;
1977cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1978cabdff1aSopenharmony_ci    __m256i wgt, zero = __lasx_xvldi(0);
1979cabdff1aSopenharmony_ci    uint32_t offset_val;
1980cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1981cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1982cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1983cabdff1aSopenharmony_ci
1984cabdff1aSopenharmony_ci    offset_val = (unsigned) offset_in << log2_denom;
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci    wgt    = __lasx_xvreplgr2vr_h(weight_src);
1987cabdff1aSopenharmony_ci    offset = __lasx_xvreplgr2vr_h(offset_val);
1988cabdff1aSopenharmony_ci    denom  = __lasx_xvreplgr2vr_h(log2_denom);
1989cabdff1aSopenharmony_ci
1990cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1991cabdff1aSopenharmony_ci              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1992cabdff1aSopenharmony_ci    src += stride_4x;
1993cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1994cabdff1aSopenharmony_ci              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1995cabdff1aSopenharmony_ci    src -= stride_4x;
1996cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
1997cabdff1aSopenharmony_ci              tmp5, tmp0, tmp1, tmp2, tmp3);
1998cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1999cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
2000cabdff1aSopenharmony_ci    src0_l = __lasx_xvilvl_b(zero, src0);
2001cabdff1aSopenharmony_ci    src0_h = __lasx_xvilvh_b(zero, src0);
2002cabdff1aSopenharmony_ci    src0_l = __lasx_xvmul_h(wgt, src0_l);
2003cabdff1aSopenharmony_ci    src0_h = __lasx_xvmul_h(wgt, src0_h);
2004cabdff1aSopenharmony_ci    src0_l = __lasx_xvsadd_h(src0_l, offset);
2005cabdff1aSopenharmony_ci    src0_h = __lasx_xvsadd_h(src0_h, offset);
2006cabdff1aSopenharmony_ci    src0_l = __lasx_xvmaxi_h(src0_l, 0);
2007cabdff1aSopenharmony_ci    src0_h = __lasx_xvmaxi_h(src0_h, 0);
2008cabdff1aSopenharmony_ci    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
2009cabdff1aSopenharmony_ci    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
2010cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_l, src, 0, 0);
2011cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
2012cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
2013cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
2014cabdff1aSopenharmony_ci    src += stride_4x;
2015cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_l, src, 0, 4);
2016cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
2017cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
2018cabdff1aSopenharmony_ci    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
2019cabdff1aSopenharmony_ci}
2020cabdff1aSopenharmony_ci
/*
 * Weighted prediction entry point for 4-pixel-wide blocks.
 *
 * Dispatches on height: 2 and 4 have dedicated kernels, every other value
 * is handled by the 8-row kernel.
 */
void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    switch (height) {
    case 2:
        avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    case 4:
        avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    default:
        avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    }
}
2033cabdff1aSopenharmony_ci
2034cabdff1aSopenharmony_civoid ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2035cabdff1aSopenharmony_ci{
2036cabdff1aSopenharmony_ci    __m256i src0, dst0, dst1, dst2, dst3, zero;
2037cabdff1aSopenharmony_ci    __m256i tmp0, tmp1;
2038cabdff1aSopenharmony_ci    uint8_t* _dst1 = _dst + stride;
2039cabdff1aSopenharmony_ci    uint8_t* _dst2 = _dst1 + stride;
2040cabdff1aSopenharmony_ci    uint8_t* _dst3 = _dst2 + stride;
2041cabdff1aSopenharmony_ci
2042cabdff1aSopenharmony_ci    src0 = __lasx_xvld(_src, 0);
2043cabdff1aSopenharmony_ci    dst0 = __lasx_xvldrepl_w(_dst, 0);
2044cabdff1aSopenharmony_ci    dst1 = __lasx_xvldrepl_w(_dst1, 0);
2045cabdff1aSopenharmony_ci    dst2 = __lasx_xvldrepl_w(_dst2, 0);
2046cabdff1aSopenharmony_ci    dst3 = __lasx_xvldrepl_w(_dst3, 0);
2047cabdff1aSopenharmony_ci    tmp0 = __lasx_xvilvl_w(dst1, dst0);
2048cabdff1aSopenharmony_ci    tmp1 = __lasx_xvilvl_w(dst3, dst2);
2049cabdff1aSopenharmony_ci    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
2050cabdff1aSopenharmony_ci    tmp0 = __lasx_vext2xv_hu_bu(dst0);
2051cabdff1aSopenharmony_ci    zero = __lasx_xvldi(0);
2052cabdff1aSopenharmony_ci    tmp1 = __lasx_xvadd_h(src0, tmp0);
2053cabdff1aSopenharmony_ci    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
2054cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0, _dst, 0, 0);
2055cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
2056cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
2057cabdff1aSopenharmony_ci    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
2058cabdff1aSopenharmony_ci    __lasx_xvst(zero, _src, 0);
2059cabdff1aSopenharmony_ci}
2060cabdff1aSopenharmony_ci
2061cabdff1aSopenharmony_civoid ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2062cabdff1aSopenharmony_ci{
2063cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
2064cabdff1aSopenharmony_ci    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2065cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3;
2066cabdff1aSopenharmony_ci    __m256i zero = __lasx_xvldi(0);
2067cabdff1aSopenharmony_ci    uint8_t *_dst1 = _dst + stride;
2068cabdff1aSopenharmony_ci    uint8_t *_dst2 = _dst1 + stride;
2069cabdff1aSopenharmony_ci    uint8_t *_dst3 = _dst2 + stride;
2070cabdff1aSopenharmony_ci    uint8_t *_dst4 = _dst3 + stride;
2071cabdff1aSopenharmony_ci    uint8_t *_dst5 = _dst4 + stride;
2072cabdff1aSopenharmony_ci    uint8_t *_dst6 = _dst5 + stride;
2073cabdff1aSopenharmony_ci    uint8_t *_dst7 = _dst6 + stride;
2074cabdff1aSopenharmony_ci
2075cabdff1aSopenharmony_ci    src0 = __lasx_xvld(_src, 0);
2076cabdff1aSopenharmony_ci    src1 = __lasx_xvld(_src, 32);
2077cabdff1aSopenharmony_ci    src2 = __lasx_xvld(_src, 64);
2078cabdff1aSopenharmony_ci    src3 = __lasx_xvld(_src, 96);
2079cabdff1aSopenharmony_ci    dst0 = __lasx_xvldrepl_d(_dst, 0);
2080cabdff1aSopenharmony_ci    dst1 = __lasx_xvldrepl_d(_dst1, 0);
2081cabdff1aSopenharmony_ci    dst2 = __lasx_xvldrepl_d(_dst2, 0);
2082cabdff1aSopenharmony_ci    dst3 = __lasx_xvldrepl_d(_dst3, 0);
2083cabdff1aSopenharmony_ci    dst4 = __lasx_xvldrepl_d(_dst4, 0);
2084cabdff1aSopenharmony_ci    dst5 = __lasx_xvldrepl_d(_dst5, 0);
2085cabdff1aSopenharmony_ci    dst6 = __lasx_xvldrepl_d(_dst6, 0);
2086cabdff1aSopenharmony_ci    dst7 = __lasx_xvldrepl_d(_dst7, 0);
2087cabdff1aSopenharmony_ci    tmp0 = __lasx_xvilvl_d(dst1, dst0);
2088cabdff1aSopenharmony_ci    tmp1 = __lasx_xvilvl_d(dst3, dst2);
2089cabdff1aSopenharmony_ci    tmp2 = __lasx_xvilvl_d(dst5, dst4);
2090cabdff1aSopenharmony_ci    tmp3 = __lasx_xvilvl_d(dst7, dst6);
2091cabdff1aSopenharmony_ci    dst0 = __lasx_vext2xv_hu_bu(tmp0);
2092cabdff1aSopenharmony_ci    dst1 = __lasx_vext2xv_hu_bu(tmp1);
2093cabdff1aSopenharmony_ci    dst1 = __lasx_vext2xv_hu_bu(tmp1);
2094cabdff1aSopenharmony_ci    dst2 = __lasx_vext2xv_hu_bu(tmp2);
2095cabdff1aSopenharmony_ci    dst3 = __lasx_vext2xv_hu_bu(tmp3);
2096cabdff1aSopenharmony_ci    tmp0 = __lasx_xvadd_h(src0, dst0);
2097cabdff1aSopenharmony_ci    tmp1 = __lasx_xvadd_h(src1, dst1);
2098cabdff1aSopenharmony_ci    tmp2 = __lasx_xvadd_h(src2, dst2);
2099cabdff1aSopenharmony_ci    tmp3 = __lasx_xvadd_h(src3, dst3);
2100cabdff1aSopenharmony_ci    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
2101cabdff1aSopenharmony_ci    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
2102cabdff1aSopenharmony_ci    __lasx_xvst(zero, _src, 0);
2103cabdff1aSopenharmony_ci    __lasx_xvst(zero, _src, 32);
2104cabdff1aSopenharmony_ci    __lasx_xvst(zero, _src, 64);
2105cabdff1aSopenharmony_ci    __lasx_xvst(zero, _src, 96);
2106cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, _dst, 0, 0);
2107cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
2108cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
2109cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
2110cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
2111cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
2112cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
2113cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
2114cabdff1aSopenharmony_ci}
2115