/*
 * Loongson LASX optimized h264chroma
 *
 * Copyright (c) 2020 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23cabdff1aSopenharmony_ci
#include "h264chroma_lasx.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
28cabdff1aSopenharmony_ci
/**
 * Byte-shuffle control table passed to __lasx_xvshuf_b by the kernels below.
 *
 * Bytes  0..31 (loaded at offset 0 by the 8-wide kernels): in each 16-byte
 *     half the indices are (0,1),(1,2),...,(7,8), i.e. every byte paired with
 *     its right-hand neighbour for eight output columns.
 * Bytes 32..63 (loaded at offset 32 by the 4-wide kernels): in each 16-byte
 *     half the indices are (0,1)..(3,4) followed by (16,17)..(19,20), i.e.
 *     four neighbour pairs at indices 0..4 and four at indices 16..20
 *     (presumably addressing the second shuffle operand - confirm against
 *     the xvshuf.b definition).
 */
static const uint8_t chroma_mask_arr[64] = {
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_8x4_lasx(uint8_t *src, uint8_t *dst,
37cabdff1aSopenharmony_ci                             ptrdiff_t stride, uint32_t coef_hor0,
38cabdff1aSopenharmony_ci                             uint32_t coef_hor1, uint32_t coef_ver0,
39cabdff1aSopenharmony_ci                             uint32_t coef_ver1)
40cabdff1aSopenharmony_ci{
41cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
42cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
43cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride_2x << 1;
44cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, out;
45cabdff1aSopenharmony_ci    __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
46cabdff1aSopenharmony_ci    __m256i mask;
47cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
48cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
49cabdff1aSopenharmony_ci    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
50cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
51cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
52cabdff1aSopenharmony_ci
53cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
54cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
55cabdff1aSopenharmony_ci              src1, src2, src3, src4);
56cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
57cabdff1aSopenharmony_ci    src0 = __lasx_xvshuf_b(src0, src0, mask);
58cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
59cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
60cabdff1aSopenharmony_ci    res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
61cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
62cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
63cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
64cabdff1aSopenharmony_ci    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
65cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
66cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
67cabdff1aSopenharmony_ci    out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
68cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst, 0, 0);
69cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride, 0, 2);
70cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
71cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
72cabdff1aSopenharmony_ci}
73cabdff1aSopenharmony_ci
74cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_8x8_lasx(uint8_t *src, uint8_t *dst,
75cabdff1aSopenharmony_ci                             ptrdiff_t stride, uint32_t coef_hor0,
76cabdff1aSopenharmony_ci                             uint32_t coef_hor1, uint32_t coef_ver0,
77cabdff1aSopenharmony_ci                             uint32_t coef_ver1)
78cabdff1aSopenharmony_ci{
79cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
80cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
81cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
82cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
83cabdff1aSopenharmony_ci    __m256i out0, out1;
84cabdff1aSopenharmony_ci    __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
85cabdff1aSopenharmony_ci    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
86cabdff1aSopenharmony_ci    __m256i mask;
87cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
88cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
89cabdff1aSopenharmony_ci    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
90cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
91cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
94cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
95cabdff1aSopenharmony_ci              src1, src2, src3, src4);
96cabdff1aSopenharmony_ci    src += stride_4x;
97cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
98cabdff1aSopenharmony_ci              src5, src6, src7, src8);
99cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
100cabdff1aSopenharmony_ci              src8, src7, 0x20, src1, src3, src5, src7);
101cabdff1aSopenharmony_ci    src0 = __lasx_xvshuf_b(src0, src0, mask);
102cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
103cabdff1aSopenharmony_ci              src7, mask, src1, src3, src5, src7);
104cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
105cabdff1aSopenharmony_ci              coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
106cabdff1aSopenharmony_ci    res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
107cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
108cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
109cabdff1aSopenharmony_ci    res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
110cabdff1aSopenharmony_ci    res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
111cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
112cabdff1aSopenharmony_ci    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
113cabdff1aSopenharmony_ci    res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
114cabdff1aSopenharmony_ci    res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
115cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1,
116cabdff1aSopenharmony_ci              res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1,
117cabdff1aSopenharmony_ci              res_vt0, res_vt1, res_vt2, res_vt3);
118cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1);
119cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
120cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
121cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
122cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
123cabdff1aSopenharmony_ci    dst += stride_4x;
124cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
125cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
126cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
127cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
128cabdff1aSopenharmony_ci}
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_8x4_lasx(uint8_t *src, uint8_t *dst,
131cabdff1aSopenharmony_ci                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
132cabdff1aSopenharmony_ci{
133cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
134cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
135cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, out;
136cabdff1aSopenharmony_ci    __m256i res0, res1;
137cabdff1aSopenharmony_ci    __m256i mask;
138cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
139cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
140cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
141cabdff1aSopenharmony_ci
142cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
143cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
144cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2);
145cabdff1aSopenharmony_ci    src3 = __lasx_xvldx(src, stride_3x);
146cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
147cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
148cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
149cabdff1aSopenharmony_ci    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
150cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst, 0, 0);
151cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride, 0, 2);
152cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
153cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
154cabdff1aSopenharmony_ci
155cabdff1aSopenharmony_ci}
156cabdff1aSopenharmony_ci
157cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_8x8_lasx(uint8_t *src, uint8_t *dst,
158cabdff1aSopenharmony_ci                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
159cabdff1aSopenharmony_ci{
160cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
161cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
162cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
163cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
164cabdff1aSopenharmony_ci    __m256i out0, out1;
165cabdff1aSopenharmony_ci    __m256i res0, res1, res2, res3;
166cabdff1aSopenharmony_ci    __m256i mask;
167cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
168cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
169cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
172cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
173cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
174cabdff1aSopenharmony_ci              src1, src2, src3, src4);
175cabdff1aSopenharmony_ci    src += stride_4x;
176cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6);
177cabdff1aSopenharmony_ci    src7 = __lasx_xvldx(src, stride_3x);
178cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
179cabdff1aSopenharmony_ci              src7, src6, 0x20, src0, src2, src4, src6);
180cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask,
181cabdff1aSopenharmony_ci              src6, src6, mask, src0, src2, src4, src6);
182cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
183cabdff1aSopenharmony_ci              coeff_vec, res0, res1, res2, res3);
184cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
185cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
186cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
187cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
188cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
189cabdff1aSopenharmony_ci    dst += stride_4x;
190cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
191cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
192cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
193cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
194cabdff1aSopenharmony_ci}
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_nonmult_lasx(uint8_t *src,
197cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
198cabdff1aSopenharmony_ci                             uint32_t coeff1, int32_t height)
199cabdff1aSopenharmony_ci{
200cabdff1aSopenharmony_ci    uint32_t row;
201cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
202cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
203cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
204cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, out;
205cabdff1aSopenharmony_ci    __m256i res0, res1;
206cabdff1aSopenharmony_ci    __m256i mask;
207cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
208cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
209cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
210cabdff1aSopenharmony_ci
211cabdff1aSopenharmony_ci    mask = __lasx_xvld(chroma_mask_arr, 0);
212cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci    for (row = height >> 2; row--;) {
215cabdff1aSopenharmony_ci        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
216cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
217cabdff1aSopenharmony_ci        src += stride_4x;
218cabdff1aSopenharmony_ci        DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
219cabdff1aSopenharmony_ci        DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
220cabdff1aSopenharmony_ci        DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
221cabdff1aSopenharmony_ci        out = __lasx_xvssrarni_bu_h(res1, res0, 6);
222cabdff1aSopenharmony_ci        __lasx_xvstelm_d(out, dst, 0, 0);
223cabdff1aSopenharmony_ci        __lasx_xvstelm_d(out, dst + stride, 0, 2);
224cabdff1aSopenharmony_ci        __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
225cabdff1aSopenharmony_ci        __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
226cabdff1aSopenharmony_ci        dst += stride_4x;
227cabdff1aSopenharmony_ci    }
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci    if ((height & 3)) {
230cabdff1aSopenharmony_ci        src0 = __lasx_xvld(src, 0);
231cabdff1aSopenharmony_ci        src1 = __lasx_xvldx(src, stride);
232cabdff1aSopenharmony_ci        src1 = __lasx_xvpermi_q(src1, src0, 0x20);
233cabdff1aSopenharmony_ci        src0 = __lasx_xvshuf_b(src1, src1, mask);
234cabdff1aSopenharmony_ci        res0 = __lasx_xvdp2_h_bu(src0, coeff_vec);
235cabdff1aSopenharmony_ci        out  = __lasx_xvssrarni_bu_h(res0, res0, 6);
236cabdff1aSopenharmony_ci        __lasx_xvstelm_d(out, dst, 0, 0);
237cabdff1aSopenharmony_ci        dst += stride;
238cabdff1aSopenharmony_ci        __lasx_xvstelm_d(out, dst, 0, 2);
239cabdff1aSopenharmony_ci    }
240cabdff1aSopenharmony_ci}
241cabdff1aSopenharmony_ci
242cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_8x4_lasx(uint8_t *src, uint8_t *dst,
243cabdff1aSopenharmony_ci                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
244cabdff1aSopenharmony_ci{
245cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
246cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
247cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, out;
248cabdff1aSopenharmony_ci    __m256i res0, res1;
249cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
250cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
251cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
254cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
255cabdff1aSopenharmony_ci    src += stride;
256cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
257cabdff1aSopenharmony_ci              src1, src2, src3, src4);
258cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
259cabdff1aSopenharmony_ci              src4, src3, 0x20, src0, src1, src2, src3);
260cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
261cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
262cabdff1aSopenharmony_ci    out  = __lasx_xvssrarni_bu_h(res1, res0, 6);
263cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst, 0, 0);
264cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride, 0, 2);
265cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
266cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
267cabdff1aSopenharmony_ci}
268cabdff1aSopenharmony_ci
269cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_8x8_lasx(uint8_t *src, uint8_t *dst,
270cabdff1aSopenharmony_ci                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
271cabdff1aSopenharmony_ci{
272cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
273cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
274cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
275cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
276cabdff1aSopenharmony_ci    __m256i out0, out1;
277cabdff1aSopenharmony_ci    __m256i res0, res1, res2, res3;
278cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
279cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
280cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
283cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
284cabdff1aSopenharmony_ci    src += stride;
285cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
286cabdff1aSopenharmony_ci              src1, src2, src3, src4);
287cabdff1aSopenharmony_ci    src += stride_4x;
288cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
289cabdff1aSopenharmony_ci              src5, src6, src7, src8);
290cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
291cabdff1aSopenharmony_ci              src4, src3, 0x20, src0, src1, src2, src3);
292cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
293cabdff1aSopenharmony_ci              src8, src7, 0x20, src4, src5, src6, src7);
294cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
295cabdff1aSopenharmony_ci              src0, src2, src4, src6);
296cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec,
297cabdff1aSopenharmony_ci              src6, coeff_vec, res0, res1, res2, res3);
298cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
299cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
300cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
301cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
302cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
303cabdff1aSopenharmony_ci    dst += stride_4x;
304cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
305cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
306cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
307cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
308cabdff1aSopenharmony_ci}
309cabdff1aSopenharmony_ci
310cabdff1aSopenharmony_cistatic av_always_inline void copy_width8x8_lasx(uint8_t *src, uint8_t *dst,
311cabdff1aSopenharmony_ci                             ptrdiff_t stride)
312cabdff1aSopenharmony_ci{
313cabdff1aSopenharmony_ci    uint64_t tmp[8];
314cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3, stride_4;
315cabdff1aSopenharmony_ci    __asm__ volatile (
316cabdff1aSopenharmony_ci        "slli.d   %[stride_2],     %[stride],     1             \n\t"
317cabdff1aSopenharmony_ci        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
318cabdff1aSopenharmony_ci        "slli.d   %[stride_4],     %[stride_2],   1             \n\t"
319cabdff1aSopenharmony_ci        "ld.d     %[tmp0],         %[src],        0x0           \n\t"
320cabdff1aSopenharmony_ci        "ldx.d    %[tmp1],         %[src],        %[stride]     \n\t"
321cabdff1aSopenharmony_ci        "ldx.d    %[tmp2],         %[src],        %[stride_2]   \n\t"
322cabdff1aSopenharmony_ci        "ldx.d    %[tmp3],         %[src],        %[stride_3]   \n\t"
323cabdff1aSopenharmony_ci        "add.d    %[src],          %[src],        %[stride_4]   \n\t"
324cabdff1aSopenharmony_ci        "ld.d     %[tmp4],         %[src],        0x0           \n\t"
325cabdff1aSopenharmony_ci        "ldx.d    %[tmp5],         %[src],        %[stride]     \n\t"
326cabdff1aSopenharmony_ci        "ldx.d    %[tmp6],         %[src],        %[stride_2]   \n\t"
327cabdff1aSopenharmony_ci        "ldx.d    %[tmp7],         %[src],        %[stride_3]   \n\t"
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci        "st.d     %[tmp0],         %[dst],        0x0           \n\t"
330cabdff1aSopenharmony_ci        "stx.d    %[tmp1],         %[dst],        %[stride]     \n\t"
331cabdff1aSopenharmony_ci        "stx.d    %[tmp2],         %[dst],        %[stride_2]   \n\t"
332cabdff1aSopenharmony_ci        "stx.d    %[tmp3],         %[dst],        %[stride_3]   \n\t"
333cabdff1aSopenharmony_ci        "add.d    %[dst],          %[dst],        %[stride_4]   \n\t"
334cabdff1aSopenharmony_ci        "st.d     %[tmp4],         %[dst],        0x0           \n\t"
335cabdff1aSopenharmony_ci        "stx.d    %[tmp5],         %[dst],        %[stride]     \n\t"
336cabdff1aSopenharmony_ci        "stx.d    %[tmp6],         %[dst],        %[stride_2]   \n\t"
337cabdff1aSopenharmony_ci        "stx.d    %[tmp7],         %[dst],        %[stride_3]   \n\t"
338cabdff1aSopenharmony_ci        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
339cabdff1aSopenharmony_ci          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
340cabdff1aSopenharmony_ci          [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
341cabdff1aSopenharmony_ci          [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
342cabdff1aSopenharmony_ci          [dst]"+&r"(dst),            [src]"+&r"(src),
343cabdff1aSopenharmony_ci          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
344cabdff1aSopenharmony_ci          [stride_4]"=&r"(stride_4)
345cabdff1aSopenharmony_ci        : [stride]"r"(stride)
346cabdff1aSopenharmony_ci        : "memory"
347cabdff1aSopenharmony_ci    );
348cabdff1aSopenharmony_ci}
349cabdff1aSopenharmony_ci
350cabdff1aSopenharmony_cistatic av_always_inline void copy_width8x4_lasx(uint8_t *src, uint8_t *dst,
351cabdff1aSopenharmony_ci                             ptrdiff_t stride)
352cabdff1aSopenharmony_ci{
353cabdff1aSopenharmony_ci    uint64_t tmp[4];
354cabdff1aSopenharmony_ci    ptrdiff_t stride_2, stride_3;
355cabdff1aSopenharmony_ci    __asm__ volatile (
356cabdff1aSopenharmony_ci        "slli.d   %[stride_2],     %[stride],     1             \n\t"
357cabdff1aSopenharmony_ci        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
358cabdff1aSopenharmony_ci        "ld.d     %[tmp0],         %[src],        0x0           \n\t"
359cabdff1aSopenharmony_ci        "ldx.d    %[tmp1],         %[src],        %[stride]     \n\t"
360cabdff1aSopenharmony_ci        "ldx.d    %[tmp2],         %[src],        %[stride_2]   \n\t"
361cabdff1aSopenharmony_ci        "ldx.d    %[tmp3],         %[src],        %[stride_3]   \n\t"
362cabdff1aSopenharmony_ci
363cabdff1aSopenharmony_ci        "st.d     %[tmp0],         %[dst],        0x0           \n\t"
364cabdff1aSopenharmony_ci        "stx.d    %[tmp1],         %[dst],        %[stride]     \n\t"
365cabdff1aSopenharmony_ci        "stx.d    %[tmp2],         %[dst],        %[stride_2]   \n\t"
366cabdff1aSopenharmony_ci        "stx.d    %[tmp3],         %[dst],        %[stride_3]   \n\t"
367cabdff1aSopenharmony_ci        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
368cabdff1aSopenharmony_ci          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
369cabdff1aSopenharmony_ci          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3)
370cabdff1aSopenharmony_ci        : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src)
371cabdff1aSopenharmony_ci        : "memory"
372cabdff1aSopenharmony_ci    );
373cabdff1aSopenharmony_ci}
374cabdff1aSopenharmony_ci
/**
 * Dispatch the 8-wide two-dimensional chroma filter by block height.
 *
 * Only heights 4 and 8 are handled; for any other height the function
 * returns without touching dst.
 *
 * @param src        top-left source pixel
 * @param dst        destination block
 * @param stride     byte distance between rows, used for both src and dst
 * @param coef_hor0  horizontal weight, forwarded unchanged
 * @param coef_hor1  horizontal weight, forwarded unchanged
 * @param coef_ver0  vertical weight, forwarded unchanged
 * @param coef_ver1  vertical weight, forwarded unchanged
 * @param height     number of rows to produce (4 or 8)
 */
static void avc_chroma_hv_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1,
                                  int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                               coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                               coef_ver1);
    }
}
388cabdff1aSopenharmony_ci
389cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
390cabdff1aSopenharmony_ci                                   uint32_t coef_hor0, uint32_t coef_hor1,
391cabdff1aSopenharmony_ci                                   uint32_t coef_ver0, uint32_t coef_ver1)
392cabdff1aSopenharmony_ci{
393cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
394cabdff1aSopenharmony_ci    __m256i src0, src1, src2;
395cabdff1aSopenharmony_ci    __m256i res_hz, res_vt;
396cabdff1aSopenharmony_ci    __m256i mask;
397cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
398cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
399cabdff1aSopenharmony_ci    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
400cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
401cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
402cabdff1aSopenharmony_ci    __m256i coeff_vt_vec  = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02);
403cabdff1aSopenharmony_ci
404cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
405cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
406cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1);
407cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(src0, src1, 0x02);
408cabdff1aSopenharmony_ci    res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec);
409cabdff1aSopenharmony_ci    res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec);
410cabdff1aSopenharmony_ci    res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01);
411cabdff1aSopenharmony_ci    res_vt = __lasx_xvadd_h(res_hz, res_vt);
412cabdff1aSopenharmony_ci    res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6);
413cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_vt, dst, 0, 0);
414cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_vt, dst + stride, 0, 1);
415cabdff1aSopenharmony_ci}
416cabdff1aSopenharmony_ci
417cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
418cabdff1aSopenharmony_ci                                   uint32_t coef_hor0, uint32_t coef_hor1,
419cabdff1aSopenharmony_ci                                   uint32_t coef_ver0, uint32_t coef_ver1)
420cabdff1aSopenharmony_ci{
421cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
422cabdff1aSopenharmony_ci    ptrdiff_t stride_3 = stride_2 + stride;
423cabdff1aSopenharmony_ci    ptrdiff_t stride_4 = stride_2 << 1;
424cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4;
425cabdff1aSopenharmony_ci    __m256i res_hz0, res_hz1, res_vt0, res_vt1;
426cabdff1aSopenharmony_ci    __m256i mask;
427cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
428cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
429cabdff1aSopenharmony_ci    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
430cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
431cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
432cabdff1aSopenharmony_ci
433cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
434cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
435cabdff1aSopenharmony_ci              src, stride_4, src1, src2, src3, src4);
436cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
437cabdff1aSopenharmony_ci              src4, src3, mask, src0, src1, src2, src3);
438cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1);
439cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
440cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
441cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1);
442cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6);
443cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst, 0, 0);
444cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
445cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
446cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
447cabdff1aSopenharmony_ci}
448cabdff1aSopenharmony_ci
449cabdff1aSopenharmony_cistatic void avc_chroma_hv_4x8_lasx(uint8_t *src, uint8_t * dst, ptrdiff_t stride,
450cabdff1aSopenharmony_ci                                   uint32_t coef_hor0, uint32_t coef_hor1,
451cabdff1aSopenharmony_ci                                   uint32_t coef_ver0, uint32_t coef_ver1)
452cabdff1aSopenharmony_ci{
453cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
454cabdff1aSopenharmony_ci    ptrdiff_t stride_3 = stride_2 + stride;
455cabdff1aSopenharmony_ci    ptrdiff_t stride_4 = stride_2 << 1;
456cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
457cabdff1aSopenharmony_ci    __m256i res_hz0, res_hz1, res_hz2, res_hz3;
458cabdff1aSopenharmony_ci    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
459cabdff1aSopenharmony_ci    __m256i mask;
460cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
461cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
462cabdff1aSopenharmony_ci    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
463cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
464cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
465cabdff1aSopenharmony_ci
466cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
467cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
468cabdff1aSopenharmony_ci              src, stride_4, src1, src2, src3, src4);
469cabdff1aSopenharmony_ci    src += stride_4;
470cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
471cabdff1aSopenharmony_ci              src, stride_4, src5, src6, src7, src8);
472cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
473cabdff1aSopenharmony_ci              src4, src3, mask, src0, src1, src2, src3);
474cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask,
475cabdff1aSopenharmony_ci              src8, src7, mask, src4, src5, src6, src7);
476cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02,
477cabdff1aSopenharmony_ci              src5, src7, 0x02, src0, src1, src4, src5);
478cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec,
479cabdff1aSopenharmony_ci              src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
480cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
481cabdff1aSopenharmony_ci              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
482cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2);
483cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6);
484cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst, 0, 0);
485cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
486cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
487cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
488cabdff1aSopenharmony_ci    dst += stride_4;
489cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst, 0, 2);
490cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3);
491cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6);
492cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7);
493cabdff1aSopenharmony_ci}
494cabdff1aSopenharmony_ci
/*
 * Width-4 horizontal+vertical chroma filter: forwards to the fixed-size
 * kernel matching height (8, 4 or 2). Any other height does nothing.
 */
static void avc_chroma_hv_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1,
                                  int32_t height)
{
    switch (height) {
    case 8:
        avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1,
                               coef_ver0, coef_ver1);
        break;
    case 4:
        avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1,
                               coef_ver0, coef_ver1);
        break;
    case 2:
        avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1,
                               coef_ver0, coef_ver1);
        break;
    default:
        /* No kernel for this height: dst is left untouched. */
        break;
    }
}
511cabdff1aSopenharmony_ci
512cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
513cabdff1aSopenharmony_ci                                   uint32_t coeff0, uint32_t coeff1)
514cabdff1aSopenharmony_ci{
515cabdff1aSopenharmony_ci    __m256i src0, src1;
516cabdff1aSopenharmony_ci    __m256i res, mask;
517cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
518cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
519cabdff1aSopenharmony_ci    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
520cabdff1aSopenharmony_ci
521cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
522cabdff1aSopenharmony_ci    src1 = __lasx_xvldx(src, stride);
523cabdff1aSopenharmony_ci    src0 = __lasx_xvshuf_b(src1, src0, mask);
524cabdff1aSopenharmony_ci    res = __lasx_xvdp2_h_bu(src0, coeff_vec);
525cabdff1aSopenharmony_ci    res = __lasx_xvslli_h(res, 3);
526cabdff1aSopenharmony_ci    res = __lasx_xvssrarni_bu_h(res, res, 6);
527cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst, 0, 0);
528cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride, 0, 1);
529cabdff1aSopenharmony_ci}
530cabdff1aSopenharmony_ci
531cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
532cabdff1aSopenharmony_ci                                   uint32_t coeff0, uint32_t coeff1)
533cabdff1aSopenharmony_ci{
534cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
535cabdff1aSopenharmony_ci    ptrdiff_t stride_3 = stride_2 + stride;
536cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
537cabdff1aSopenharmony_ci    __m256i res, mask;
538cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
539cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
540cabdff1aSopenharmony_ci    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
543cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
544cabdff1aSopenharmony_ci    src3 = __lasx_xvldx(src, stride_3);
545cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2);
546cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
547cabdff1aSopenharmony_ci    res = __lasx_xvdp2_h_bu(src0, coeff_vec);
548cabdff1aSopenharmony_ci    res = __lasx_xvslli_h(res, 3);
549cabdff1aSopenharmony_ci    res = __lasx_xvssrarni_bu_h(res, res, 6);
550cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst, 0, 0);
551cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride, 0, 1);
552cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
553cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
554cabdff1aSopenharmony_ci}
555cabdff1aSopenharmony_ci
556cabdff1aSopenharmony_cistatic void avc_chroma_hz_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
557cabdff1aSopenharmony_ci                                   uint32_t coeff0, uint32_t coeff1)
558cabdff1aSopenharmony_ci{
559cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
560cabdff1aSopenharmony_ci    ptrdiff_t stride_3 = stride_2 + stride;
561cabdff1aSopenharmony_ci    ptrdiff_t stride_4 = stride_2 << 1;
562cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
563cabdff1aSopenharmony_ci    __m256i res0, res1, mask;
564cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
565cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
566cabdff1aSopenharmony_ci    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
567cabdff1aSopenharmony_ci
568cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
569cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
570cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
571cabdff1aSopenharmony_ci              src, stride_4, src1, src2, src3, src4);
572cabdff1aSopenharmony_ci    src += stride_4;
573cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6);
574cabdff1aSopenharmony_ci    src7 = __lasx_xvldx(src, stride_3);
575cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
576cabdff1aSopenharmony_ci              src7, src6, mask, src0, src2, src4, src6);
577cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4);
578cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src4, coeff_vec, res0, res1);
579cabdff1aSopenharmony_ci    res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
580cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst, 0, 0);
581cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride, 0, 1);
582cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
583cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
584cabdff1aSopenharmony_ci    dst += stride_4;
585cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst, 0, 2);
586cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride, 0, 3);
587cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
588cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
589cabdff1aSopenharmony_ci}
590cabdff1aSopenharmony_ci
/*
 * Width-4 horizontal chroma filter: forwards to the fixed-size kernel
 * matching height (8, 4 or 2). Any other height does nothing.
 */
static void avc_chroma_hz_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                  uint32_t coeff0, uint32_t coeff1,
                                  int32_t height)
{
    switch (height) {
    case 8:
        avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1);
        break;
    case 2:
        avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: dst is left untouched. */
        break;
    }
}
603cabdff1aSopenharmony_ci
/*
 * Width-8 horizontal chroma filter: heights 4 and 8 use their dedicated
 * kernels, every other height is handed to the generic row-count routine.
 */
static void avc_chroma_hz_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                  uint32_t coeff0, uint32_t coeff1,
                                  int32_t height)
{
    switch (height) {
    case 4:
        avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, coeff1);
        break;
    default:
        avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height);
        break;
    }
}
616cabdff1aSopenharmony_ci
617cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
618cabdff1aSopenharmony_ci                                   uint32_t coeff0, uint32_t coeff1)
619cabdff1aSopenharmony_ci{
620cabdff1aSopenharmony_ci    __m256i src0, src1, src2;
621cabdff1aSopenharmony_ci    __m256i tmp0, tmp1;
622cabdff1aSopenharmony_ci    __m256i res;
623cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
624cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
625cabdff1aSopenharmony_ci    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
626cabdff1aSopenharmony_ci
627cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
628cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2);
629cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1);
630cabdff1aSopenharmony_ci    tmp0 = __lasx_xvilvl_d(tmp1, tmp0);
631cabdff1aSopenharmony_ci    res  = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
632cabdff1aSopenharmony_ci    res  = __lasx_xvslli_h(res, 3);
633cabdff1aSopenharmony_ci    res  = __lasx_xvssrarni_bu_h(res, res, 6);
634cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst, 0, 0);
635cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride, 0, 1);
636cabdff1aSopenharmony_ci}
637cabdff1aSopenharmony_ci
638cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
639cabdff1aSopenharmony_ci                                   uint32_t coeff0, uint32_t coeff1)
640cabdff1aSopenharmony_ci{
641cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
642cabdff1aSopenharmony_ci    ptrdiff_t stride_3 = stride_2 + stride;
643cabdff1aSopenharmony_ci    ptrdiff_t stride_4 = stride_2 << 1;
644cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4;
645cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3;
646cabdff1aSopenharmony_ci    __m256i res;
647cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
648cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
649cabdff1aSopenharmony_ci    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
650cabdff1aSopenharmony_ci
651cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
652cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
653cabdff1aSopenharmony_ci              src, stride_4, src1, src2, src3, src4);
654cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
655cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
656cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
657cabdff1aSopenharmony_ci    tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
658cabdff1aSopenharmony_ci    res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
659cabdff1aSopenharmony_ci    res = __lasx_xvslli_h(res, 3);
660cabdff1aSopenharmony_ci    res = __lasx_xvssrarni_bu_h(res, res, 6);
661cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst, 0, 0);
662cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride, 0, 1);
663cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
664cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
665cabdff1aSopenharmony_ci}
666cabdff1aSopenharmony_ci
667cabdff1aSopenharmony_cistatic void avc_chroma_vt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
668cabdff1aSopenharmony_ci                                   uint32_t coeff0, uint32_t coeff1)
669cabdff1aSopenharmony_ci{
670cabdff1aSopenharmony_ci    ptrdiff_t stride_2 = stride << 1;
671cabdff1aSopenharmony_ci    ptrdiff_t stride_3 = stride_2 + stride;
672cabdff1aSopenharmony_ci    ptrdiff_t stride_4 = stride_2 << 1;
673cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
674cabdff1aSopenharmony_ci    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
675cabdff1aSopenharmony_ci    __m256i res0, res1;
676cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
677cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
678cabdff1aSopenharmony_ci    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
679cabdff1aSopenharmony_ci
680cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
681cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
682cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
683cabdff1aSopenharmony_ci              src, stride_4, src1, src2, src3, src4);
684cabdff1aSopenharmony_ci    src += stride_4;
685cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
686cabdff1aSopenharmony_ci              src, stride_4, src5, src6, src7, src8);
687cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
688cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
689cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
690cabdff1aSopenharmony_ci              tmp4, tmp5, tmp6, tmp7);
691cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
692cabdff1aSopenharmony_ci              tmp0, tmp2, tmp4, tmp6);
693cabdff1aSopenharmony_ci    tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
694cabdff1aSopenharmony_ci    tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02);
695cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1);
696cabdff1aSopenharmony_ci    res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
697cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst, 0, 0);
698cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride, 0, 1);
699cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
700cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
701cabdff1aSopenharmony_ci    dst += stride_4;
702cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst, 0, 2);
703cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride, 0, 3);
704cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
705cabdff1aSopenharmony_ci    __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
706cabdff1aSopenharmony_ci}
707cabdff1aSopenharmony_ci
/*
 * Width-4 vertical chroma filter: forwards to the fixed-size kernel matching
 * height (8, 4 or 2). Any other height does nothing.
 */
static void avc_chroma_vt_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                  uint32_t coeff0, uint32_t coeff1,
                                  int32_t height)
{
    switch (height) {
    case 8:
        avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1);
        break;
    case 4:
        avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1);
        break;
    case 2:
        avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: dst is left untouched. */
        break;
    }
}
720cabdff1aSopenharmony_ci
/*
 * Width-8 vertical chroma filter: forwards to the 8x4 or 8x8 kernel.
 * Any other height does nothing.
 */
static void avc_chroma_vt_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                  uint32_t coeff0, uint32_t coeff1,
                                  int32_t height)
{
    switch (height) {
    case 4:
        avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1);
        break;
    case 8:
        avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1);
        break;
    default:
        /* No kernel for this height: dst is left untouched. */
        break;
    }
}
731cabdff1aSopenharmony_ci
732cabdff1aSopenharmony_cistatic void copy_width4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
733cabdff1aSopenharmony_ci                             int32_t height)
734cabdff1aSopenharmony_ci{
735cabdff1aSopenharmony_ci    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
736cabdff1aSopenharmony_ci
737cabdff1aSopenharmony_ci    if (8 == height) {
738cabdff1aSopenharmony_ci        ptrdiff_t stride_2, stride_3, stride_4;
739cabdff1aSopenharmony_ci
740cabdff1aSopenharmony_ci        __asm__ volatile (
741cabdff1aSopenharmony_ci        "slli.d   %[stride_2],     %[stride],     1             \n\t"
742cabdff1aSopenharmony_ci        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
743cabdff1aSopenharmony_ci        "slli.d   %[stride_4],     %[stride_2],   1             \n\t"
744cabdff1aSopenharmony_ci        "ld.wu    %[tp0],          %[src],        0             \n\t"
745cabdff1aSopenharmony_ci        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
746cabdff1aSopenharmony_ci        "ldx.wu   %[tp2],          %[src],        %[stride_2]   \n\t"
747cabdff1aSopenharmony_ci        "ldx.wu   %[tp3],          %[src],        %[stride_3]   \n\t"
748cabdff1aSopenharmony_ci        "add.d    %[src],          %[src],        %[stride_4]   \n\t"
749cabdff1aSopenharmony_ci        "ld.wu    %[tp4],          %[src],        0             \n\t"
750cabdff1aSopenharmony_ci        "ldx.wu   %[tp5],          %[src],        %[stride]     \n\t"
751cabdff1aSopenharmony_ci        "ldx.wu   %[tp6],          %[src],        %[stride_2]   \n\t"
752cabdff1aSopenharmony_ci        "ldx.wu   %[tp7],          %[src],        %[stride_3]   \n\t"
753cabdff1aSopenharmony_ci        "st.w     %[tp0],          %[dst],        0             \n\t"
754cabdff1aSopenharmony_ci        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
755cabdff1aSopenharmony_ci        "stx.w    %[tp2],          %[dst],        %[stride_2]   \n\t"
756cabdff1aSopenharmony_ci        "stx.w    %[tp3],          %[dst],        %[stride_3]   \n\t"
757cabdff1aSopenharmony_ci        "add.d    %[dst],          %[dst],        %[stride_4]   \n\t"
758cabdff1aSopenharmony_ci        "st.w     %[tp4],          %[dst],        0             \n\t"
759cabdff1aSopenharmony_ci        "stx.w    %[tp5],          %[dst],        %[stride]     \n\t"
760cabdff1aSopenharmony_ci        "stx.w    %[tp6],          %[dst],        %[stride_2]   \n\t"
761cabdff1aSopenharmony_ci        "stx.w    %[tp7],          %[dst],        %[stride_3]   \n\t"
762cabdff1aSopenharmony_ci        : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4),
763cabdff1aSopenharmony_ci          [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
764cabdff1aSopenharmony_ci          [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5),
765cabdff1aSopenharmony_ci          [tp6]"+&r"(tp6), [tp7]"+&r"(tp7)
766cabdff1aSopenharmony_ci        : [stride]"r"(stride)
767cabdff1aSopenharmony_ci        : "memory"
768cabdff1aSopenharmony_ci        );
769cabdff1aSopenharmony_ci    } else if (4 == height) {
770cabdff1aSopenharmony_ci        ptrdiff_t stride_2, stride_3;
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci        __asm__ volatile (
773cabdff1aSopenharmony_ci        "slli.d   %[stride_2],     %[stride],     1             \n\t"
774cabdff1aSopenharmony_ci        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
775cabdff1aSopenharmony_ci        "ld.wu    %[tp0],          %[src],        0             \n\t"
776cabdff1aSopenharmony_ci        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
777cabdff1aSopenharmony_ci        "ldx.wu   %[tp2],          %[src],        %[stride_2]   \n\t"
778cabdff1aSopenharmony_ci        "ldx.wu   %[tp3],          %[src],        %[stride_3]   \n\t"
779cabdff1aSopenharmony_ci        "st.w     %[tp0],          %[dst],        0             \n\t"
780cabdff1aSopenharmony_ci        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
781cabdff1aSopenharmony_ci        "stx.w    %[tp2],          %[dst],        %[stride_2]   \n\t"
782cabdff1aSopenharmony_ci        "stx.w    %[tp3],          %[dst],        %[stride_3]   \n\t"
783cabdff1aSopenharmony_ci        : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3),
784cabdff1aSopenharmony_ci          [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
785cabdff1aSopenharmony_ci          [tp2]"+&r"(tp2), [tp3]"+&r"(tp3)
786cabdff1aSopenharmony_ci        : [stride]"r"(stride)
787cabdff1aSopenharmony_ci        : "memory"
788cabdff1aSopenharmony_ci        );
789cabdff1aSopenharmony_ci    } else if (2 == height) {
790cabdff1aSopenharmony_ci        __asm__ volatile (
791cabdff1aSopenharmony_ci        "ld.wu    %[tp0],          %[src],        0             \n\t"
792cabdff1aSopenharmony_ci        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
793cabdff1aSopenharmony_ci        "st.w     %[tp0],          %[dst],        0             \n\t"
794cabdff1aSopenharmony_ci        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
795cabdff1aSopenharmony_ci        : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1)
796cabdff1aSopenharmony_ci        : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride)
797cabdff1aSopenharmony_ci        : "memory"
798cabdff1aSopenharmony_ci        );
799cabdff1aSopenharmony_ci    }
800cabdff1aSopenharmony_ci}
801cabdff1aSopenharmony_ci
/*
 * Copy an 8-byte-wide block: forwards to the 8x8 or 8x4 copy routine.
 * Any other height does nothing.
 */
static void copy_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                             int32_t height)
{
    switch (height) {
    case 8:
        copy_width8x8_lasx(src, dst, stride);
        break;
    case 4:
        copy_width8x4_lasx(src, dst, stride);
        break;
    default:
        /* No routine for this height: dst is left untouched. */
        break;
    }
}
811cabdff1aSopenharmony_ci
/*
 * Put-variant chroma motion compensation for blocks 4 samples wide.
 *
 * x and y are the offsets used as filter weights; each must lie in 0..7
 * (checked by av_assert2). The weight pair passed to the filters is
 * (offset, 8 - offset). Which filter runs depends on which offsets are
 * non-zero: both -> horizontal+vertical, x only -> horizontal,
 * y only -> vertical, neither -> plain copy.
 */
void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (!x && !y) {
        copy_width4_lasx(src, dst, stride, height);
        return;
    }
    if (!y) {
        /* Only the horizontal offset is set. */
        avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height);
        return;
    }
    if (!x) {
        /* Only the vertical offset is set. */
        avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height);
        return;
    }
    avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
}
827cabdff1aSopenharmony_ci
/*
 * Put-variant chroma motion compensation for blocks 8 samples wide.
 *
 * x and y are the offsets used as filter weights; each must lie in 0..7
 * (checked by av_assert2). The weight pair passed to the filters is
 * (offset, 8 - offset). Which filter runs depends on which offsets are
 * non-zero: both -> horizontal+vertical, x only -> horizontal,
 * y only -> vertical, neither -> plain copy.
 */
void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width8_lasx(src, dst, stride, height);
    }
}
843cabdff1aSopenharmony_ci
844cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(uint8_t *src,
845cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
846cabdff1aSopenharmony_ci                             uint32_t coef_hor1, uint32_t coef_ver0,
847cabdff1aSopenharmony_ci                             uint32_t coef_ver1)
848cabdff1aSopenharmony_ci{
849cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
850cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
851cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
852cabdff1aSopenharmony_ci    __m256i tp0, tp1, tp2, tp3;
853cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, out;
854cabdff1aSopenharmony_ci    __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
855cabdff1aSopenharmony_ci    __m256i mask;
856cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
857cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
858cabdff1aSopenharmony_ci    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
859cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
860cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
861cabdff1aSopenharmony_ci
862cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
863cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
864cabdff1aSopenharmony_ci              src1, src2, src3, src4);
865cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
866cabdff1aSopenharmony_ci    src0 = __lasx_xvshuf_b(src0, src0, mask);
867cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
868cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
869cabdff1aSopenharmony_ci    res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
870cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
871cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
872cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
873cabdff1aSopenharmony_ci    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
874cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
875cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
876cabdff1aSopenharmony_ci    out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
877cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
878cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
879cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
880cabdff1aSopenharmony_ci    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
881cabdff1aSopenharmony_ci    out = __lasx_xvavgr_bu(out, tp0);
882cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst, 0, 0);
883cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride, 0, 2);
884cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
885cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
886cabdff1aSopenharmony_ci}
887cabdff1aSopenharmony_ci
888cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(uint8_t *src,
889cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
890cabdff1aSopenharmony_ci                             uint32_t coef_hor1, uint32_t coef_ver0,
891cabdff1aSopenharmony_ci                             uint32_t coef_ver1)
892cabdff1aSopenharmony_ci{
893cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
894cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
895cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
896cabdff1aSopenharmony_ci    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
897cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
898cabdff1aSopenharmony_ci    __m256i out0, out1;
899cabdff1aSopenharmony_ci    __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
900cabdff1aSopenharmony_ci    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
901cabdff1aSopenharmony_ci    __m256i mask;
902cabdff1aSopenharmony_ci    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
903cabdff1aSopenharmony_ci    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
904cabdff1aSopenharmony_ci    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
905cabdff1aSopenharmony_ci    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
906cabdff1aSopenharmony_ci    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
907cabdff1aSopenharmony_ci
908cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
909cabdff1aSopenharmony_ci    src += stride;
910cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
911cabdff1aSopenharmony_ci              src1, src2, src3, src4);
912cabdff1aSopenharmony_ci    src += stride_4x;
913cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
914cabdff1aSopenharmony_ci              src5, src6, src7, src8);
915cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
916cabdff1aSopenharmony_ci              src8, src7, 0x20, src1, src3, src5, src7);
917cabdff1aSopenharmony_ci    src0 = __lasx_xvshuf_b(src0, src0, mask);
918cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
919cabdff1aSopenharmony_ci              src7, mask, src1, src3, src5, src7);
920cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
921cabdff1aSopenharmony_ci              coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
922cabdff1aSopenharmony_ci    res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
923cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
924cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
925cabdff1aSopenharmony_ci    res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
926cabdff1aSopenharmony_ci    res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
927cabdff1aSopenharmony_ci    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
928cabdff1aSopenharmony_ci    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
929cabdff1aSopenharmony_ci    res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
930cabdff1aSopenharmony_ci    res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
931cabdff1aSopenharmony_ci    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
932cabdff1aSopenharmony_ci    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
933cabdff1aSopenharmony_ci    res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1);
934cabdff1aSopenharmony_ci    res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1);
935cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6,
936cabdff1aSopenharmony_ci              out0, out1);
937cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
938cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
939cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
940cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
941cabdff1aSopenharmony_ci    dst += stride_4x;
942cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
943cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
944cabdff1aSopenharmony_ci    dst -= stride_4x;
945cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
946cabdff1aSopenharmony_ci    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
947cabdff1aSopenharmony_ci    out0 = __lasx_xvavgr_bu(out0, dst0);
948cabdff1aSopenharmony_ci    out1 = __lasx_xvavgr_bu(out1, dst1);
949cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
950cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
951cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
952cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
953cabdff1aSopenharmony_ci    dst += stride_4x;
954cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
955cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
956cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
957cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
958cabdff1aSopenharmony_ci}
959cabdff1aSopenharmony_ci
960cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(uint8_t *src,
961cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
962cabdff1aSopenharmony_ci                             uint32_t coeff1)
963cabdff1aSopenharmony_ci{
964cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
965cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
966cabdff1aSopenharmony_ci    __m256i tp0, tp1, tp2, tp3;
967cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, out;
968cabdff1aSopenharmony_ci    __m256i res0, res1;
969cabdff1aSopenharmony_ci    __m256i mask;
970cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
971cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
972cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
973cabdff1aSopenharmony_ci
974cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
975cabdff1aSopenharmony_ci    mask = __lasx_xvld(chroma_mask_arr, 0);
976cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
977cabdff1aSopenharmony_ci              src0, src1, src2, src3);
978cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
979cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
980cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
981cabdff1aSopenharmony_ci    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
982cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
983cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
984cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
985cabdff1aSopenharmony_ci    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
986cabdff1aSopenharmony_ci    out = __lasx_xvavgr_bu(out, tp0);
987cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst, 0, 0);
988cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride, 0, 2);
989cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
990cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
991cabdff1aSopenharmony_ci}
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(uint8_t *src,
994cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
995cabdff1aSopenharmony_ci                             uint32_t coeff1)
996cabdff1aSopenharmony_ci{
997cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
998cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
999cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1000cabdff1aSopenharmony_ci    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
1001cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
1002cabdff1aSopenharmony_ci    __m256i out0, out1;
1003cabdff1aSopenharmony_ci    __m256i res0, res1, res2, res3;
1004cabdff1aSopenharmony_ci    __m256i mask;
1005cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
1006cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
1007cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
1008cabdff1aSopenharmony_ci
1009cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
1010cabdff1aSopenharmony_ci    mask = __lasx_xvld(chroma_mask_arr, 0);
1011cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
1012cabdff1aSopenharmony_ci              src0, src1, src2, src3);
1013cabdff1aSopenharmony_ci    src += stride_4x;
1014cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
1015cabdff1aSopenharmony_ci              src4, src5, src6, src7);
1016cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
1017cabdff1aSopenharmony_ci              src7, src6, 0x20, src0, src2, src4, src6);
1018cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4,
1019cabdff1aSopenharmony_ci              mask, src6, src6, mask, src0, src2, src4, src6);
1020cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
1021cabdff1aSopenharmony_ci              coeff_vec, res0, res1, res2, res3);
1022cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
1023cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
1024cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
1025cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
1026cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
1027cabdff1aSopenharmony_ci    dst += stride_4x;
1028cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
1029cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
1030cabdff1aSopenharmony_ci    dst -= stride_4x;
1031cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
1032cabdff1aSopenharmony_ci    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
1033cabdff1aSopenharmony_ci    out0 = __lasx_xvavgr_bu(out0, dst0);
1034cabdff1aSopenharmony_ci    out1 = __lasx_xvavgr_bu(out1, dst1);
1035cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
1036cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
1037cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
1038cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
1039cabdff1aSopenharmony_ci    dst += stride_4x;
1040cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
1041cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
1042cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
1043cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
1044cabdff1aSopenharmony_ci}
1045cabdff1aSopenharmony_ci
1046cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(uint8_t *src,
1047cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
1048cabdff1aSopenharmony_ci                             uint32_t coeff1)
1049cabdff1aSopenharmony_ci{
1050cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1051cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1052cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1053cabdff1aSopenharmony_ci    __m256i tp0, tp1, tp2, tp3;
1054cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, out;
1055cabdff1aSopenharmony_ci    __m256i res0, res1;
1056cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
1057cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
1058cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
1059cabdff1aSopenharmony_ci
1060cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
1061cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
1062cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
1063cabdff1aSopenharmony_ci              src1, src2, src3, src4);
1064cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
1065cabdff1aSopenharmony_ci              src4, src3, 0x20, src0, src1, src2, src3);
1066cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
1067cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
1068cabdff1aSopenharmony_ci    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
1069cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
1070cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
1071cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
1072cabdff1aSopenharmony_ci    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
1073cabdff1aSopenharmony_ci    out = __lasx_xvavgr_bu(out, tp0);
1074cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst, 0, 0);
1075cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride, 0, 2);
1076cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
1077cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
1078cabdff1aSopenharmony_ci}
1079cabdff1aSopenharmony_ci
1080cabdff1aSopenharmony_cistatic av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(uint8_t *src,
1081cabdff1aSopenharmony_ci                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
1082cabdff1aSopenharmony_ci                             uint32_t coeff1)
1083cabdff1aSopenharmony_ci{
1084cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1085cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1086cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1087cabdff1aSopenharmony_ci    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
1088cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
1089cabdff1aSopenharmony_ci    __m256i out0, out1;
1090cabdff1aSopenharmony_ci    __m256i res0, res1, res2, res3;
1091cabdff1aSopenharmony_ci    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
1092cabdff1aSopenharmony_ci    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
1093cabdff1aSopenharmony_ci    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
1094cabdff1aSopenharmony_ci
1095cabdff1aSopenharmony_ci    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
1096cabdff1aSopenharmony_ci    src0 = __lasx_xvld(src, 0);
1097cabdff1aSopenharmony_ci    src += stride;
1098cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
1099cabdff1aSopenharmony_ci              src1, src2, src3, src4);
1100cabdff1aSopenharmony_ci    src += stride_4x;
1101cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
1102cabdff1aSopenharmony_ci              src5, src6, src7, src8);
1103cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
1104cabdff1aSopenharmony_ci              src4, src3, 0x20, src0, src1, src2, src3);
1105cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
1106cabdff1aSopenharmony_ci              src8, src7, 0x20, src4, src5, src6, src7);
1107cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
1108cabdff1aSopenharmony_ci              src0, src2, src4, src6);
1109cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
1110cabdff1aSopenharmony_ci              coeff_vec, res0, res1, res2, res3);
1111cabdff1aSopenharmony_ci    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
1112cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
1113cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
1114cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
1115cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
1116cabdff1aSopenharmony_ci    dst += stride_4x;
1117cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
1118cabdff1aSopenharmony_ci              tp0, tp1, tp2, tp3);
1119cabdff1aSopenharmony_ci    dst -= stride_4x;
1120cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
1121cabdff1aSopenharmony_ci    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
1122cabdff1aSopenharmony_ci    out0 = __lasx_xvavgr_bu(out0, dst0);
1123cabdff1aSopenharmony_ci    out1 = __lasx_xvavgr_bu(out1, dst1);
1124cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst, 0, 0);
1125cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
1126cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
1127cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
1128cabdff1aSopenharmony_ci    dst += stride_4x;
1129cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst, 0, 0);
1130cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
1131cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
1132cabdff1aSopenharmony_ci    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
1133cabdff1aSopenharmony_ci}
1134cabdff1aSopenharmony_ci
1135cabdff1aSopenharmony_cistatic av_always_inline void avg_width8x8_lasx(uint8_t *src, uint8_t *dst,
1136cabdff1aSopenharmony_ci                                               ptrdiff_t stride)
1137cabdff1aSopenharmony_ci{
1138cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
1139cabdff1aSopenharmony_ci    __m256i dst0, dst1, dst2, dst3;
1140cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1141cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1142cabdff1aSopenharmony_ci    ptrdiff_t stride_4x = stride << 2;
1143cabdff1aSopenharmony_ci
1144cabdff1aSopenharmony_ci    src0 = __lasx_xvldrepl_d(src, 0);
1145cabdff1aSopenharmony_ci    src1 = __lasx_xvldrepl_d(src + stride, 0);
1146cabdff1aSopenharmony_ci    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
1147cabdff1aSopenharmony_ci    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
1148cabdff1aSopenharmony_ci    dst0 = __lasx_xvldrepl_d(dst, 0);
1149cabdff1aSopenharmony_ci    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
1150cabdff1aSopenharmony_ci    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
1151cabdff1aSopenharmony_ci    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
1152cabdff1aSopenharmony_ci    src0 = __lasx_xvpackev_d(src1,src0);
1153cabdff1aSopenharmony_ci    src2 = __lasx_xvpackev_d(src3,src2);
1154cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
1155cabdff1aSopenharmony_ci    dst0 = __lasx_xvpackev_d(dst1,dst0);
1156cabdff1aSopenharmony_ci    dst2 = __lasx_xvpackev_d(dst3,dst2);
1157cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
1158cabdff1aSopenharmony_ci    dst0 = __lasx_xvavgr_bu(src0, dst0);
1159cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1160cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1161cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1162cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1163cabdff1aSopenharmony_ci
1164cabdff1aSopenharmony_ci    src += stride_4x;
1165cabdff1aSopenharmony_ci    dst += stride_4x;
1166cabdff1aSopenharmony_ci    src0 = __lasx_xvldrepl_d(src, 0);
1167cabdff1aSopenharmony_ci    src1 = __lasx_xvldrepl_d(src + stride, 0);
1168cabdff1aSopenharmony_ci    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
1169cabdff1aSopenharmony_ci    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
1170cabdff1aSopenharmony_ci    dst0 = __lasx_xvldrepl_d(dst, 0);
1171cabdff1aSopenharmony_ci    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
1172cabdff1aSopenharmony_ci    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
1173cabdff1aSopenharmony_ci    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
1174cabdff1aSopenharmony_ci    src0 = __lasx_xvpackev_d(src1,src0);
1175cabdff1aSopenharmony_ci    src2 = __lasx_xvpackev_d(src3,src2);
1176cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
1177cabdff1aSopenharmony_ci    dst0 = __lasx_xvpackev_d(dst1,dst0);
1178cabdff1aSopenharmony_ci    dst2 = __lasx_xvpackev_d(dst3,dst2);
1179cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
1180cabdff1aSopenharmony_ci    dst0 = __lasx_xvavgr_bu(src0, dst0);
1181cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1182cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1183cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1184cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1185cabdff1aSopenharmony_ci}
1186cabdff1aSopenharmony_ci
1187cabdff1aSopenharmony_cistatic av_always_inline void avg_width8x4_lasx(uint8_t *src, uint8_t *dst,
1188cabdff1aSopenharmony_ci                                               ptrdiff_t stride)
1189cabdff1aSopenharmony_ci{
1190cabdff1aSopenharmony_ci    __m256i src0, src1, src2, src3;
1191cabdff1aSopenharmony_ci    __m256i dst0, dst1, dst2, dst3;
1192cabdff1aSopenharmony_ci    ptrdiff_t stride_2x = stride << 1;
1193cabdff1aSopenharmony_ci    ptrdiff_t stride_3x = stride_2x + stride;
1194cabdff1aSopenharmony_ci
1195cabdff1aSopenharmony_ci    src0 = __lasx_xvldrepl_d(src, 0);
1196cabdff1aSopenharmony_ci    src1 = __lasx_xvldrepl_d(src + stride, 0);
1197cabdff1aSopenharmony_ci    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
1198cabdff1aSopenharmony_ci    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
1199cabdff1aSopenharmony_ci    dst0 = __lasx_xvldrepl_d(dst, 0);
1200cabdff1aSopenharmony_ci    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
1201cabdff1aSopenharmony_ci    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
1202cabdff1aSopenharmony_ci    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
1203cabdff1aSopenharmony_ci    src0 = __lasx_xvpackev_d(src1,src0);
1204cabdff1aSopenharmony_ci    src2 = __lasx_xvpackev_d(src3,src2);
1205cabdff1aSopenharmony_ci    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
1206cabdff1aSopenharmony_ci    dst0 = __lasx_xvpackev_d(dst1,dst0);
1207cabdff1aSopenharmony_ci    dst2 = __lasx_xvpackev_d(dst3,dst2);
1208cabdff1aSopenharmony_ci    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
1209cabdff1aSopenharmony_ci    dst0 = __lasx_xvavgr_bu(src0, dst0);
1210cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst, 0, 0);
1211cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1212cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1213cabdff1aSopenharmony_ci    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1214cabdff1aSopenharmony_ci}
1215cabdff1aSopenharmony_ci
/**
 * Run the 2-D chroma filter over an 8-pixel-wide block of the given height
 * and average the result into dst.
 *
 * The block is processed in tiles of 8 rows, followed by one tile of 4 rows
 * when at least 4 rows remain. Heights 4 and 8 behave exactly as before;
 * larger multiples of 4 (for example 16) are now filtered instead of being
 * silently skipped. A trailing remainder of 1-3 rows is not processed.
 *
 * @param src       top source row; height + 1 rows are read
 * @param dst       destination block
 * @param stride    row pitch in bytes of src and dst
 * @param coef_hor0 horizontal weight, forwarded to the tile kernels
 * @param coef_hor1 horizontal weight, forwarded to the tile kernels
 * @param coef_ver0 vertical weight, forwarded to the tile kernels
 * @param coef_ver1 vertical weight, forwarded to the tile kernels
 * @param height    number of rows to produce
 */
static void avc_chroma_hv_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst,
                                               ptrdiff_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1,
                                               int32_t height)
{
    /* Byte offset of the current tile; avoids stepping src/dst past the end. */
    ptrdiff_t offset = 0;

    for (; height >= 8; height -= 8) {
        avc_chroma_hv_and_aver_dst_8x8_lasx(src + offset, dst + offset, stride,
                                            coef_hor0, coef_hor1, coef_ver0,
                                            coef_ver1);
        offset += stride * 8;
    }
    if (height >= 4) {
        avc_chroma_hv_and_aver_dst_8x4_lasx(src + offset, dst + offset, stride,
                                            coef_hor0, coef_hor1, coef_ver0,
                                            coef_ver1);
    }
}
1232cabdff1aSopenharmony_ci
/**
 * Run the horizontal chroma filter over an 8-pixel-wide block of the given
 * height and average the result into dst.
 *
 * The block is processed in tiles of 8 rows, followed by one tile of 4 rows
 * when at least 4 rows remain. Heights 4 and 8 behave exactly as before;
 * larger multiples of 4 (for example 16) are now filtered instead of being
 * silently skipped. A trailing remainder of 1-3 rows is not processed.
 *
 * @param src    first source row
 * @param dst    destination block
 * @param stride row pitch in bytes of src and dst
 * @param coeff0 horizontal weight, forwarded to the tile kernels
 * @param coeff1 horizontal weight, forwarded to the tile kernels
 * @param height number of rows to produce
 */
static void avc_chroma_hz_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst,
                                               ptrdiff_t stride, uint32_t coeff0,
                                               uint32_t coeff1, int32_t height)
{
    /* Byte offset of the current tile; avoids stepping src/dst past the end. */
    ptrdiff_t offset = 0;

    for (; height >= 8; height -= 8) {
        avc_chroma_hz_and_aver_dst_8x8_lasx(src + offset, dst + offset, stride,
                                            coeff0, coeff1);
        offset += stride * 8;
    }
    if (height >= 4) {
        avc_chroma_hz_and_aver_dst_8x4_lasx(src + offset, dst + offset, stride,
                                            coeff0, coeff1);
    }
}
1243cabdff1aSopenharmony_ci
/**
 * Run the vertical chroma filter over an 8-pixel-wide block of the given
 * height and average the result into dst.
 *
 * The block is processed in tiles of 8 rows, followed by one tile of 4 rows
 * when at least 4 rows remain. Heights 4 and 8 behave exactly as before;
 * larger multiples of 4 (for example 16) are now filtered instead of being
 * silently skipped. A trailing remainder of 1-3 rows is not processed.
 *
 * @param src    top source row; height + 1 rows are read
 * @param dst    destination block
 * @param stride row pitch in bytes of src and dst
 * @param coeff0 vertical weight, forwarded to the tile kernels
 * @param coeff1 vertical weight, forwarded to the tile kernels
 * @param height number of rows to produce
 */
static void avc_chroma_vt_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst,
                                               ptrdiff_t stride, uint32_t coeff0,
                                               uint32_t coeff1, int32_t height)
{
    /* Byte offset of the current tile; avoids stepping src/dst past the end. */
    ptrdiff_t offset = 0;

    for (; height >= 8; height -= 8) {
        avc_chroma_vt_and_aver_dst_8x8_lasx(src + offset, dst + offset, stride,
                                            coeff0, coeff1);
        offset += stride * 8;
    }
    if (height >= 4) {
        avc_chroma_vt_and_aver_dst_8x4_lasx(src + offset, dst + offset, stride,
                                            coeff0, coeff1);
    }
}
1254cabdff1aSopenharmony_ci
/**
 * Rounding-average an 8-pixel-wide block of src into dst.
 *
 * The block is processed in tiles of 8 rows, followed by one tile of 4 rows
 * when at least 4 rows remain. Heights 4 and 8 behave exactly as before;
 * larger multiples of 4 (for example 16) are now averaged instead of being
 * silently skipped. A trailing remainder of 1-3 rows is not processed.
 *
 * @param src    first source row
 * @param dst    first destination row
 * @param stride row pitch in bytes of src and dst
 * @param height number of rows to average
 */
static void avg_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                            int32_t height)
{
    /* Byte offset of the current tile; avoids stepping src/dst past the end. */
    ptrdiff_t offset = 0;

    for (; height >= 8; height -= 8) {
        avg_width8x8_lasx(src + offset, dst + offset, stride);
        offset += stride * 8;
    }
    if (height >= 4) {
        avg_width8x4_lasx(src + offset, dst + offset, stride);
    }
}
1264cabdff1aSopenharmony_ci
/**
 * Averaging 8-pixel-wide chroma motion compensation: interpolate src at the
 * fractional offset (x, y) and average the result into dst.
 *
 * The fractional offsets select the kernel: both non-zero uses the 2-D
 * filter, only x the horizontal one, only y the vertical one, and
 * x == y == 0 a plain average of src into dst. The weights handed to the
 * filters are the offset and its complement to 8.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride row pitch in bytes of src and dst
 * @param height number of rows
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 */
void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y,
                                           (8 - y), height);
        return;
    }
    if (x) {
        avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), height);
        return;
    }
    if (y) {
        avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y), height);
        return;
    }

    /* No fractional offset: nothing to interpolate, only average. */
    avg_width8_lasx(src, dst, stride, height);
}
1281