/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
24cabdff1aSopenharmony_ci#include "hevcdsp_lsx.h"
25cabdff1aSopenharmony_ci
/*
 * Index table, two rows of 16 bytes, stored as overlapping (i, i + 1) pairs.
 * The first row is loaded as a vector mask by hevc_hz_8t_16w_lsx.
 * NOTE(review): the second row mixes indices 0..4 with 16..20, which looks
 * like a two-source variant for narrower blocks -- confirm against its users.
 */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* row 0, 8 width cases: pairs (0,1) .. (7,8) */
    0, 1,   1, 2,   2, 3,   3, 4,
    4, 5,   5, 6,   6, 7,   7, 8,
    /* row 1: pairs (0,1) .. (3,4) followed by pairs (16,17) .. (19,20) */
    0, 1,   1, 2,   2, 3,   3, 4,
    16, 17,   17, 18,   18, 19,   19, 20
};
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_cistatic av_always_inline __m128i
33cabdff1aSopenharmony_cihevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1)
34cabdff1aSopenharmony_ci{
35cabdff1aSopenharmony_ci    __m128i out;
36cabdff1aSopenharmony_ci
37cabdff1aSopenharmony_ci    vec0 = __lsx_vsadd_h(in0, vec0);
38cabdff1aSopenharmony_ci    vec1 = __lsx_vsadd_h(in1, vec1);
39cabdff1aSopenharmony_ci    out  = __lsx_vssrarni_bu_h(vec1, vec0, 7);
40cabdff1aSopenharmony_ci    return out;
41cabdff1aSopenharmony_ci}
42cabdff1aSopenharmony_ci
/* hevc_bi_copy: dst = av_clip_uint8(((src0 << 6) + src1 + 64) >> 7) */
44cabdff1aSopenharmony_cistatic
45cabdff1aSopenharmony_civoid hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride,
46cabdff1aSopenharmony_ci                         int16_t *src1_ptr, int32_t src2_stride,
47cabdff1aSopenharmony_ci                         uint8_t *dst, int32_t dst_stride, int32_t height)
48cabdff1aSopenharmony_ci{
49cabdff1aSopenharmony_ci    int32_t loop_cnt = height >> 3;
50cabdff1aSopenharmony_ci    int32_t res = (height & 0x07) >> 1;
51cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
52cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
53cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
54cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
55cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 1);
56cabdff1aSopenharmony_ci    int32_t src2_stride_4x = (src2_stride << 2);
57cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
58cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
59cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
60cabdff1aSopenharmony_ci    __m128i src0, src1;
61cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
62cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
63cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3;
64cabdff1aSopenharmony_ci    __m128i reg0, reg1, reg2, reg3;
65cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_ci    for (;loop_cnt--;) {
68cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
69cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
70cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
71cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
72cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
73cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
74cabdff1aSopenharmony_ci        src0 = __lsx_vilvl_d(tmp1, tmp0);
75cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
76cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
77cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
78cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
79cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
80cabdff1aSopenharmony_ci        src1 = __lsx_vilvl_d(tmp1, tmp0);
81cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
82cabdff1aSopenharmony_ci
83cabdff1aSopenharmony_ci        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
84cabdff1aSopenharmony_ci        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
85cabdff1aSopenharmony_ci        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
86cabdff1aSopenharmony_ci        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
87cabdff1aSopenharmony_ci        src1_ptr += src2_stride_4x;
88cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
89cabdff1aSopenharmony_ci        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
90cabdff1aSopenharmony_ci        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
91cabdff1aSopenharmony_ci        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
92cabdff1aSopenharmony_ci        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
93cabdff1aSopenharmony_ci        src1_ptr += src2_stride_4x;
94cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
95cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2);
96cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3);
97cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
98cabdff1aSopenharmony_ci        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
99cabdff1aSopenharmony_ci        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
100cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0, dst, 0, 0);
101cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
102cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
103cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
104cabdff1aSopenharmony_ci        dst += dst_stride_4x;
105cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst1, dst, 0, 0);
106cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
107cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
108cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
109cabdff1aSopenharmony_ci        dst += dst_stride_4x;
110cabdff1aSopenharmony_ci    }
111cabdff1aSopenharmony_ci    for(;res--;) {
112cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
113cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
114cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_d(src1_ptr, 0);
115cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
116cabdff1aSopenharmony_ci        src0 = __lsx_vilvl_w(reg1, reg0);
117cabdff1aSopenharmony_ci        in0  = __lsx_vilvl_d(reg3, reg2);
118cabdff1aSopenharmony_ci        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
119cabdff1aSopenharmony_ci        dst0 = __lsx_vsadd_h(dst0, in0);
120cabdff1aSopenharmony_ci        dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
121cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0, dst, 0, 0);
122cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
123cabdff1aSopenharmony_ci        src0_ptr += src_stride_2x;
124cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
125cabdff1aSopenharmony_ci        dst += dst_stride_2x;
126cabdff1aSopenharmony_ci    }
127cabdff1aSopenharmony_ci}
128cabdff1aSopenharmony_ci
129cabdff1aSopenharmony_cistatic
130cabdff1aSopenharmony_civoid hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
131cabdff1aSopenharmony_ci                         int16_t *src1_ptr, int32_t src2_stride,
132cabdff1aSopenharmony_ci                         uint8_t *dst, int32_t dst_stride, int32_t height)
133cabdff1aSopenharmony_ci{
134cabdff1aSopenharmony_ci    int32_t loop_cnt;
135cabdff1aSopenharmony_ci    int32_t res = (height & 0x07) >> 1;
136cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
137cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
138cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
139cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
140cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
141cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
142cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
143cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
144cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
145cabdff1aSopenharmony_ci    __m128i out0, out1, out2, out3;
146cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
147cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
148cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
149cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
150cabdff1aSopenharmony_ci    __m128i reg0, reg1, reg2, reg3;
151cabdff1aSopenharmony_ci
152cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
153cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
154cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
155cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
156cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
157cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
158cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
159cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
160cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
161cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
162cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
163cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
164cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
165cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
166cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
167cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
168cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
169cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
170cabdff1aSopenharmony_ci        in4 = __lsx_vld(src1_ptr, 0);
171cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
172cabdff1aSopenharmony_ci                  src2_stride_2x, in5, in6);
173cabdff1aSopenharmony_ci        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
174cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
175cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
176cabdff1aSopenharmony_ci                  dst0, dst2, dst4, dst6);
177cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
178cabdff1aSopenharmony_ci                  dst1, dst3, dst5, dst7);
179cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
180cabdff1aSopenharmony_ci                  dst5, dst7);
181cabdff1aSopenharmony_ci        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
182cabdff1aSopenharmony_ci        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
183cabdff1aSopenharmony_ci        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
184cabdff1aSopenharmony_ci        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
185cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst, 0, 0);
186cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
187cabdff1aSopenharmony_ci        __lsx_vstelm_h(out0, dst, 4, 2);
188cabdff1aSopenharmony_ci        __lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
189cabdff1aSopenharmony_ci        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
190cabdff1aSopenharmony_ci        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
191cabdff1aSopenharmony_ci        __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
192cabdff1aSopenharmony_ci        __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
193cabdff1aSopenharmony_ci        dst += dst_stride_4x;
194cabdff1aSopenharmony_ci        __lsx_vstelm_w(out2, dst, 0, 0);
195cabdff1aSopenharmony_ci        __lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
196cabdff1aSopenharmony_ci        __lsx_vstelm_h(out2, dst, 4, 2);
197cabdff1aSopenharmony_ci        __lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
198cabdff1aSopenharmony_ci        __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
199cabdff1aSopenharmony_ci        __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
200cabdff1aSopenharmony_ci        __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
201cabdff1aSopenharmony_ci        __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
202cabdff1aSopenharmony_ci        dst += dst_stride_4x;
203cabdff1aSopenharmony_ci    }
204cabdff1aSopenharmony_ci    for (;res--;) {
205cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
206cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
207cabdff1aSopenharmony_ci        src0 = __lsx_vilvl_d(reg1, reg0);
208cabdff1aSopenharmony_ci        src0_ptr += src_stride_2x;
209cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
210cabdff1aSopenharmony_ci        in1 = __lsx_vldx(src1_ptr, src2_stride_x);
211cabdff1aSopenharmony_ci        src1_ptr += src2_stride_x;
212cabdff1aSopenharmony_ci        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
213cabdff1aSopenharmony_ci        dst1 = __lsx_vilvh_b(zero, src0);
214cabdff1aSopenharmony_ci        dst1 = __lsx_vslli_h(dst1, 6);
215cabdff1aSopenharmony_ci        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
216cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst, 0, 0);
217cabdff1aSopenharmony_ci        __lsx_vstelm_h(out0, dst, 4, 2);
218cabdff1aSopenharmony_ci        dst += dst_stride;
219cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst, 0, 2);
220cabdff1aSopenharmony_ci        __lsx_vstelm_h(out0, dst, 4, 6);
221cabdff1aSopenharmony_ci        dst += dst_stride;
222cabdff1aSopenharmony_ci    }
223cabdff1aSopenharmony_ci}
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_cistatic
226cabdff1aSopenharmony_civoid hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
227cabdff1aSopenharmony_ci                         int16_t *src1_ptr, int32_t src2_stride,
228cabdff1aSopenharmony_ci                         uint8_t *dst, int32_t dst_stride, int32_t height)
229cabdff1aSopenharmony_ci{
230cabdff1aSopenharmony_ci    int32_t loop_cnt = height >> 3;
231cabdff1aSopenharmony_ci    int32_t res = (height & 7) >> 1;
232cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
233cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
234cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
235cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
236cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
237cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
238cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
239cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
240cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
241cabdff1aSopenharmony_ci    __m128i out0, out1, out2, out3;
242cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
243cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
244cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
245cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
246cabdff1aSopenharmony_ci    __m128i reg0, reg1, reg2, reg3;
247cabdff1aSopenharmony_ci
248cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
249cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
250cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
251cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
252cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
253cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
254cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
255cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
256cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
257cabdff1aSopenharmony_ci        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
258cabdff1aSopenharmony_ci        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
259cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
260cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
261cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
262cabdff1aSopenharmony_ci                  dst0, dst2, dst4, dst6);
263cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
264cabdff1aSopenharmony_ci                  src3, dst1, dst3, dst5, dst7);
265cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
266cabdff1aSopenharmony_ci                  dst3, dst5, dst7);
267cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
268cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
269cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
270cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
271cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
272cabdff1aSopenharmony_ci        in4 = __lsx_vld(src1_ptr, 0);
273cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
274cabdff1aSopenharmony_ci                  src2_stride_2x, in5, in6);
275cabdff1aSopenharmony_ci        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
276cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
277cabdff1aSopenharmony_ci        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
278cabdff1aSopenharmony_ci        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
279cabdff1aSopenharmony_ci        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
280cabdff1aSopenharmony_ci        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
281cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst, 0, 0);
282cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
283cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
284cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
285cabdff1aSopenharmony_ci        dst += dst_stride_4x;
286cabdff1aSopenharmony_ci        __lsx_vstelm_d(out2, dst, 0, 0);
287cabdff1aSopenharmony_ci        __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
288cabdff1aSopenharmony_ci        __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
289cabdff1aSopenharmony_ci        __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
290cabdff1aSopenharmony_ci        dst += dst_stride_4x;
291cabdff1aSopenharmony_ci    }
292cabdff1aSopenharmony_ci    for (;res--;) {
293cabdff1aSopenharmony_ci        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
294cabdff1aSopenharmony_ci        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
295cabdff1aSopenharmony_ci        src0 = __lsx_vilvl_d(reg1, reg0);
296cabdff1aSopenharmony_ci        in0  = __lsx_vld(src1_ptr, 0);
297cabdff1aSopenharmony_ci        in1  = __lsx_vldx(src1_ptr, src2_stride_x);
298cabdff1aSopenharmony_ci        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
299cabdff1aSopenharmony_ci        dst1 = __lsx_vilvh_b(zero, src0);
300cabdff1aSopenharmony_ci        dst1 = __lsx_vslli_h(dst1, 6);
301cabdff1aSopenharmony_ci        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
302cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst, 0, 0);
303cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
304cabdff1aSopenharmony_ci        src0_ptr += src_stride_2x;
305cabdff1aSopenharmony_ci        src1_ptr += src2_stride_x;
306cabdff1aSopenharmony_ci        dst += dst_stride_2x;
307cabdff1aSopenharmony_ci    }
308cabdff1aSopenharmony_ci}
309cabdff1aSopenharmony_ci
310cabdff1aSopenharmony_cistatic
311cabdff1aSopenharmony_civoid hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
312cabdff1aSopenharmony_ci                          int16_t *src1_ptr, int32_t src2_stride,
313cabdff1aSopenharmony_ci                          uint8_t *dst, int32_t dst_stride, int32_t height)
314cabdff1aSopenharmony_ci{
315cabdff1aSopenharmony_ci    uint32_t loop_cnt;
316cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
317cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
318cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
319cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
320cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
321cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
322cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
323cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
324cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
325cabdff1aSopenharmony_ci    int16_t* _src1 = src1_ptr + 8;
326cabdff1aSopenharmony_ci    __m128i out0, out1, out2;
327cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
328cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
329cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
330cabdff1aSopenharmony_ci
331cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
332cabdff1aSopenharmony_ci        src0 = __lsx_vld(src0_ptr, 0);
333cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
334cabdff1aSopenharmony_ci                  src1, src2);
335cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
336cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
337cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
338cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
339cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
340cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
341cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
342cabdff1aSopenharmony_ci        in4 = __lsx_vld(_src1, 0);
343cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
344cabdff1aSopenharmony_ci                  in5, in6);
345cabdff1aSopenharmony_ci        in7 = __lsx_vldx(_src1, src2_stride_3x);
346cabdff1aSopenharmony_ci        _src1 += src2_stride_2x;
347cabdff1aSopenharmony_ci
348cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
349cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
350cabdff1aSopenharmony_ci                  dst0, dst1, dst2, dst3)
351cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
352cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5)
353cabdff1aSopenharmony_ci        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
354cabdff1aSopenharmony_ci        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
355cabdff1aSopenharmony_ci        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
356cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst, 0, 0);
357cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
358cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
359cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
360cabdff1aSopenharmony_ci        __lsx_vstelm_w(out2, dst, 8, 0);
361cabdff1aSopenharmony_ci        __lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
362cabdff1aSopenharmony_ci        __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
363cabdff1aSopenharmony_ci        __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
364cabdff1aSopenharmony_ci        dst += dst_stride_4x;
365cabdff1aSopenharmony_ci    }
366cabdff1aSopenharmony_ci}
367cabdff1aSopenharmony_ci
368cabdff1aSopenharmony_cistatic
369cabdff1aSopenharmony_civoid hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
370cabdff1aSopenharmony_ci                          int16_t *src1_ptr, int32_t src2_stride,
371cabdff1aSopenharmony_ci                          uint8_t *dst, int32_t dst_stride, int32_t height)
372cabdff1aSopenharmony_ci{
373cabdff1aSopenharmony_ci    uint32_t loop_cnt;
374cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
375cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
376cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
377cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
378cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
379cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
380cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
381cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
382cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
383cabdff1aSopenharmony_ci    int16_t *_src1 = src1_ptr + 8;
384cabdff1aSopenharmony_ci    __m128i out0, out1, out2, out3;
385cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
386cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
387cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
388cabdff1aSopenharmony_ci    __m128i zero = {0};
389cabdff1aSopenharmony_ci
390cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
391cabdff1aSopenharmony_ci        src0 = __lsx_vld(src0_ptr, 0);
392cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
393cabdff1aSopenharmony_ci                  src1, src2);
394cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
395cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
396cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
397cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
398cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
399cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
400cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
401cabdff1aSopenharmony_ci        in4 = __lsx_vld(_src1, 0);
402cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
403cabdff1aSopenharmony_ci                  in5, in6);
404cabdff1aSopenharmony_ci        in7 = __lsx_vldx(_src1, src2_stride_3x);
405cabdff1aSopenharmony_ci        _src1 += src2_stride_2x;
406cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
407cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r)
408cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
409cabdff1aSopenharmony_ci                  dst0_l, dst1_l, dst2_l, dst3_l);
410cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
411cabdff1aSopenharmony_ci                  dst0_l, dst1_l, dst2_l, dst3_l);
412cabdff1aSopenharmony_ci
413cabdff1aSopenharmony_ci        out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l);
414cabdff1aSopenharmony_ci        out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l);
415cabdff1aSopenharmony_ci        out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l);
416cabdff1aSopenharmony_ci        out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l);
417cabdff1aSopenharmony_ci        __lsx_vst(out0, dst, 0);
418cabdff1aSopenharmony_ci        __lsx_vstx(out1, dst, dst_stride);
419cabdff1aSopenharmony_ci        __lsx_vstx(out2, dst, dst_stride_2x);
420cabdff1aSopenharmony_ci        __lsx_vstx(out3, dst, dst_stride_3x);
421cabdff1aSopenharmony_ci        dst += dst_stride_4x;
422cabdff1aSopenharmony_ci    }
423cabdff1aSopenharmony_ci}
424cabdff1aSopenharmony_ci
/**
 * Bi-prediction copy for 24-pixel-wide blocks: columns 0..15 go through
 * the 16-wide kernel, columns 16..23 through the 8-wide kernel.
 * Parameters are forwarded unchanged apart from the column offset.
 */
static
void hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    uint8_t *pix_right  = src0_ptr + 16;
    int16_t *pred_right = src1_ptr + 16;
    uint8_t *dst_right  = dst + 16;

    /* Left 16 columns. */
    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                         dst, dst_stride, height);
    /* Right 8 columns. */
    hevc_bi_copy_8w_lsx(pix_right, src_stride, pred_right, src2_stride,
                        dst_right, dst_stride, height);
}
435cabdff1aSopenharmony_ci
/**
 * Bi-prediction copy for 32-pixel-wide blocks, done as two adjacent
 * 16-column strips (left strip first) with the 16-wide kernel.
 */
static
void hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int col;

    for (col = 0; col < 32; col += 16) {
        hevc_bi_copy_16w_lsx(src0_ptr + col, src_stride,
                             src1_ptr + col, src2_stride,
                             dst + col, dst_stride, height);
    }
}
446cabdff1aSopenharmony_ci
/**
 * Bi-prediction copy for 48-pixel-wide blocks, done as three adjacent
 * 16-column strips (left to right) with the 16-wide kernel.
 */
static
void hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int col;

    for (col = 0; col < 48; col += 16) {
        hevc_bi_copy_16w_lsx(src0_ptr + col, src_stride,
                             src1_ptr + col, src2_stride,
                             dst + col, dst_stride, height);
    }
}
457cabdff1aSopenharmony_ci
/**
 * Bi-prediction copy for 64-pixel-wide blocks, done as two adjacent
 * 32-column halves (left half first) with the 32-wide routine.
 */
static
void hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                          int16_t *src1_ptr, int32_t src2_stride,
                          uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int col;

    for (col = 0; col < 64; col += 32) {
        hevc_bi_copy_32w_lsx(src0_ptr + col, src_stride,
                             src1_ptr + col, src2_stride,
                             dst + col, dst_stride, height);
    }
}
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_cistatic void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
470cabdff1aSopenharmony_ci                               int16_t *src1_ptr,  int32_t src2_stride,
471cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
472cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
473cabdff1aSopenharmony_ci{
474cabdff1aSopenharmony_ci    uint32_t loop_cnt;
475cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
476cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
477cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
478cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
479cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
480cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
481cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
482cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
483cabdff1aSopenharmony_ci
484cabdff1aSopenharmony_ci    src0_ptr -= 3;
485cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
486cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
487cabdff1aSopenharmony_ci
488cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
489cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
490cabdff1aSopenharmony_ci
491cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
492cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1);
493cabdff1aSopenharmony_ci        src0_ptr += src_stride;
494cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3);
495cabdff1aSopenharmony_ci        src0_ptr += src_stride;
496cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
497cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
498cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
499cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
500cabdff1aSopenharmony_ci
501cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
502cabdff1aSopenharmony_ci                  vec0, vec1);
503cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
504cabdff1aSopenharmony_ci                  vec2, vec3);
505cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
506cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
507cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
508cabdff1aSopenharmony_ci                  vec0, vec1);
509cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
510cabdff1aSopenharmony_ci                  vec2, vec3);
511cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
512cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
513cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
514cabdff1aSopenharmony_ci                  vec0, vec1);
515cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
516cabdff1aSopenharmony_ci                  vec2, vec3);
517cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
518cabdff1aSopenharmony_ci                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
519cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
520cabdff1aSopenharmony_ci                  vec0, vec1);
521cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
522cabdff1aSopenharmony_ci                  vec2, vec3);
523cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
524cabdff1aSopenharmony_ci                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
525cabdff1aSopenharmony_ci
526cabdff1aSopenharmony_ci        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
527cabdff1aSopenharmony_ci        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
528cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
529cabdff1aSopenharmony_ci        __lsx_vstx(dst1, dst, dst_stride);
530cabdff1aSopenharmony_ci        dst += dst_stride_2x;
531cabdff1aSopenharmony_ci    }
532cabdff1aSopenharmony_ci}
533cabdff1aSopenharmony_ci
534cabdff1aSopenharmony_cistatic void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
535cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
536cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
537cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
538cabdff1aSopenharmony_ci{
539cabdff1aSopenharmony_ci    uint32_t loop_cnt;
540cabdff1aSopenharmony_ci    __m128i src0, src1, tmp0, tmp1;
541cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
542cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
543cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
544cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2;
545cabdff1aSopenharmony_ci    __m128i in0, in1, in2;
546cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
547cabdff1aSopenharmony_ci
548cabdff1aSopenharmony_ci    src0_ptr -= 3;
549cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
550cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
551cabdff1aSopenharmony_ci
552cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
553cabdff1aSopenharmony_ci              mask2, mask3, mask4);
554cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
555cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask0, 14);
556cabdff1aSopenharmony_ci
557cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
558cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
559cabdff1aSopenharmony_ci        src0_ptr += src_stride;
560cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
561cabdff1aSopenharmony_ci        in2 = __lsx_vld(src1_ptr, 32);
562cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
563cabdff1aSopenharmony_ci
564cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
565cabdff1aSopenharmony_ci                  src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
566cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
567cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
568cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
569cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0,
570cabdff1aSopenharmony_ci                  src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
571cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
572cabdff1aSopenharmony_ci                  dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
573cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0,
574cabdff1aSopenharmony_ci                  mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
575cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
576cabdff1aSopenharmony_ci                  dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_ci        tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
579cabdff1aSopenharmony_ci        dst2 = __lsx_vsadd_h(dst2, in2);
580cabdff1aSopenharmony_ci        tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);
581cabdff1aSopenharmony_ci
582cabdff1aSopenharmony_ci        __lsx_vst(tmp0, dst, 0);
583cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp1, dst, 16, 0);
584cabdff1aSopenharmony_ci        dst += dst_stride;
585cabdff1aSopenharmony_ci    }
586cabdff1aSopenharmony_ci}
587cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter for a 32-column block.
 *
 * Runs hevc_hz_8t_16w_lsx() twice: on columns 0..15, then on columns 16..31
 * with all three base pointers advanced by 16 elements.  Strides, filter and
 * height are forwarded unchanged.
 */
static void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}
598cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter for a 48-column block.
 *
 * Columns 0..15 are handled by hevc_hz_8t_16w_lsx(); columns 16..47 by
 * hevc_hz_8t_32w_lsx() with all three base pointers advanced by 16 elements.
 * Strides, filter and height are forwarded unchanged.
 */
static void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                       dst + 16, dst_stride, filter, height);
}
609cabdff1aSopenharmony_ci
/*
 * Horizontal 8-tap filter for a 64-column block.
 *
 * Runs hevc_hz_8t_32w_lsx() twice: on columns 0..31, then on columns 32..63
 * with all three base pointers advanced by 32 elements.  Strides, filter and
 * height are forwarded unchanged.
 */
static void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                       dst, dst_stride, filter, height);
    hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
                       dst + 32, dst_stride, filter, height);
}
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_cistatic av_always_inline
622cabdff1aSopenharmony_civoid hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
623cabdff1aSopenharmony_ci                       int32_t src2_stride, uint8_t *dst, int32_t dst_stride,\
624cabdff1aSopenharmony_ci                       const int8_t *filter, int32_t height)
625cabdff1aSopenharmony_ci{
626cabdff1aSopenharmony_ci    int32_t loop_cnt;
627cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
628cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
629cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
630cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
631cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
632cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
633cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
634cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
635cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
636cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5;
637cabdff1aSopenharmony_ci    __m128i src6, src7, src8, src9, src10;
638cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
639cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
640cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
641cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
642cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ci    src0_ptr -= src_stride_3x;
645cabdff1aSopenharmony_ci
646cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
647cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
648cabdff1aSopenharmony_ci
649cabdff1aSopenharmony_ci    src0 = __lsx_vld(src0_ptr, 0);
650cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
651cabdff1aSopenharmony_ci              src1, src2);
652cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
653cabdff1aSopenharmony_ci    src0_ptr += src_stride_4x;
654cabdff1aSopenharmony_ci    src4 = __lsx_vld(src0_ptr, 0);
655cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
656cabdff1aSopenharmony_ci              src5, src6);
657cabdff1aSopenharmony_ci    src0_ptr += src_stride_3x;
658cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
659cabdff1aSopenharmony_ci              src10_r, src32_r, src54_r, src21_r);
660cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
661cabdff1aSopenharmony_ci
662cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
663cabdff1aSopenharmony_ci        src7 = __lsx_vld(src0_ptr, 0);
664cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
665cabdff1aSopenharmony_ci                  src8, src9);
666cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src0_ptr, src_stride_3x);
667cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
668cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
669cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
670cabdff1aSopenharmony_ci                  in1, in2);
671cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
672cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
673cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
674cabdff1aSopenharmony_ci                  src76_r, src87_r, src98_r, src109_r);
675cabdff1aSopenharmony_ci
676cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
677cabdff1aSopenharmony_ci                  filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
678cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
679cabdff1aSopenharmony_ci                  filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
680cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
681cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
682cabdff1aSopenharmony_ci                  filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
683cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
684cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
685cabdff1aSopenharmony_ci                  filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
686cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
687cabdff1aSopenharmony_ci
688cabdff1aSopenharmony_ci        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
689cabdff1aSopenharmony_ci        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
690cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst, 0, 0);
691cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
692cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
693cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
694cabdff1aSopenharmony_ci        dst += dst_stride_4x;
695cabdff1aSopenharmony_ci
696cabdff1aSopenharmony_ci        src10_r = src54_r;
697cabdff1aSopenharmony_ci        src32_r = src76_r;
698cabdff1aSopenharmony_ci        src54_r = src98_r;
699cabdff1aSopenharmony_ci        src21_r = src65_r;
700cabdff1aSopenharmony_ci        src43_r = src87_r;
701cabdff1aSopenharmony_ci        src65_r = src109_r;
702cabdff1aSopenharmony_ci
703cabdff1aSopenharmony_ci        src6 = src10;
704cabdff1aSopenharmony_ci    }
705cabdff1aSopenharmony_ci}
706cabdff1aSopenharmony_ci
707cabdff1aSopenharmony_cistatic av_always_inline
708cabdff1aSopenharmony_civoid hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
709cabdff1aSopenharmony_ci                                 int16_t *src1_ptr, int32_t src2_stride,
710cabdff1aSopenharmony_ci                                 uint8_t *dst, int32_t dst_stride,
711cabdff1aSopenharmony_ci                                 const int8_t *filter, int32_t height,
712cabdff1aSopenharmony_ci                                 int32_t width)
713cabdff1aSopenharmony_ci{
714cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
715cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
716cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
717cabdff1aSopenharmony_ci    uint32_t loop_cnt;
718cabdff1aSopenharmony_ci    uint32_t cnt;
719cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
720cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
721cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
722cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
723cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
724cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
725cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r;
726cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src65_r, src87_r;
727cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r;
728cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src54_l, src76_l;
729cabdff1aSopenharmony_ci    __m128i src21_l, src43_l, src65_l, src87_l;
730cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l;
731cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
732cabdff1aSopenharmony_ci
733cabdff1aSopenharmony_ci    src0_ptr -= src_stride_3x;
734cabdff1aSopenharmony_ci
735cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
736cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
737cabdff1aSopenharmony_ci
738cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
739cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
740cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
741cabdff1aSopenharmony_ci        dst_tmp = dst;
742cabdff1aSopenharmony_ci
743cabdff1aSopenharmony_ci        src0 = __lsx_vld(src0_ptr_tmp, 0);
744cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
745cabdff1aSopenharmony_ci                  src_stride_2x, src1, src2);
746cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
747cabdff1aSopenharmony_ci        src0_ptr_tmp += src_stride_4x;
748cabdff1aSopenharmony_ci        src4 = __lsx_vld(src0_ptr_tmp, 0);
749cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
750cabdff1aSopenharmony_ci                  src_stride_2x, src5, src6);
751cabdff1aSopenharmony_ci        src0_ptr_tmp += src_stride_3x;
752cabdff1aSopenharmony_ci
753cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
754cabdff1aSopenharmony_ci                  src10_r, src32_r, src54_r, src21_r);
755cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
756cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
757cabdff1aSopenharmony_ci                  src10_l, src32_l, src54_l, src21_l);
758cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
759cabdff1aSopenharmony_ci
760cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 1); loop_cnt--;) {
761cabdff1aSopenharmony_ci            src7 = __lsx_vld(src0_ptr_tmp, 0);
762cabdff1aSopenharmony_ci            src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
763cabdff1aSopenharmony_ci            src0_ptr_tmp += src_stride_2x;
764cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
765cabdff1aSopenharmony_ci            src1_ptr_tmp += src2_stride;
766cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
767cabdff1aSopenharmony_ci            src1_ptr_tmp += src2_stride;
768cabdff1aSopenharmony_ci
769cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
770cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
773cabdff1aSopenharmony_ci                      filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
774cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
775cabdff1aSopenharmony_ci                      src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
776cabdff1aSopenharmony_ci                      filt1, dst0_r, dst1_r, dst0_l, dst1_l);
777cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
778cabdff1aSopenharmony_ci                      src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
779cabdff1aSopenharmony_ci                      filt2, dst0_r, dst1_r, dst0_l, dst1_l);
780cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
781cabdff1aSopenharmony_ci                      src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
782cabdff1aSopenharmony_ci                      filt3, dst0_r, dst1_r, dst0_l, dst1_l);
783cabdff1aSopenharmony_ci            dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
784cabdff1aSopenharmony_ci            dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci            __lsx_vst(dst0_r, dst_tmp, 0);
787cabdff1aSopenharmony_ci            __lsx_vstx(dst1_r, dst_tmp, dst_stride);
788cabdff1aSopenharmony_ci            dst_tmp += dst_stride_2x;
789cabdff1aSopenharmony_ci
790cabdff1aSopenharmony_ci            src10_r = src32_r;
791cabdff1aSopenharmony_ci            src32_r = src54_r;
792cabdff1aSopenharmony_ci            src54_r = src76_r;
793cabdff1aSopenharmony_ci            src21_r = src43_r;
794cabdff1aSopenharmony_ci            src43_r = src65_r;
795cabdff1aSopenharmony_ci            src65_r = src87_r;
796cabdff1aSopenharmony_ci            src10_l = src32_l;
797cabdff1aSopenharmony_ci            src32_l = src54_l;
798cabdff1aSopenharmony_ci            src54_l = src76_l;
799cabdff1aSopenharmony_ci            src21_l = src43_l;
800cabdff1aSopenharmony_ci            src43_l = src65_l;
801cabdff1aSopenharmony_ci            src65_l = src87_l;
802cabdff1aSopenharmony_ci            src6 = src8;
803cabdff1aSopenharmony_ci        }
804cabdff1aSopenharmony_ci
805cabdff1aSopenharmony_ci        src0_ptr += 16;
806cabdff1aSopenharmony_ci        src1_ptr += 16;
807cabdff1aSopenharmony_ci        dst += 16;
808cabdff1aSopenharmony_ci    }
809cabdff1aSopenharmony_ci}
810cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 16-column block: a single 16-column strip of
 * hevc_vt_8t_16multx2mult_lsx() (width = 16).  All other arguments are
 * forwarded unchanged.
 */
static void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 16);
}
819cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 24-column block.
 *
 * Columns 0..15 are handled as one 16-column strip of
 * hevc_vt_8t_16multx2mult_lsx(); columns 16..23 are handled by
 * hevc_vt_8t_8w_lsx() with all three base pointers advanced by 16 elements.
 * Note that the first helper steps two rows per iteration and the second
 * four, so they cover the same rows only when height is a multiple of 4.
 */
static void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 16);
    hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                      dst + 16, dst_stride, filter, height);
}
830cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 32-column block: two 16-column strips of
 * hevc_vt_8t_16multx2mult_lsx() (width = 32).  All other arguments are
 * forwarded unchanged.
 */
static void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 32);
}
839cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 48-column block: three 16-column strips of
 * hevc_vt_8t_16multx2mult_lsx() (width = 48).  All other arguments are
 * forwarded unchanged.
 */
static void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 48);
}
848cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 64-column block: four 16-column strips of
 * hevc_vt_8t_16multx2mult_lsx() (width = 64).  All other arguments are
 * forwarded unchanged.
 */
static void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                                dst, dst_stride, filter, height, 64);
}
857cabdff1aSopenharmony_ci
858cabdff1aSopenharmony_cistatic av_always_inline
859cabdff1aSopenharmony_civoid hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
860cabdff1aSopenharmony_ci                                int16_t *src1_ptr, int32_t src2_stride,
861cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
862cabdff1aSopenharmony_ci                                const int8_t *filter_x, const int8_t *filter_y,
863cabdff1aSopenharmony_ci                                int32_t height, int32_t width)
864cabdff1aSopenharmony_ci{
865cabdff1aSopenharmony_ci    uint32_t loop_cnt;
866cabdff1aSopenharmony_ci    uint32_t cnt;
867cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
868cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
869cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
870cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
871cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
872cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
873cabdff1aSopenharmony_ci    __m128i out;
874cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
875cabdff1aSopenharmony_ci    __m128i in0, tmp;
876cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
877cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
878cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
879cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
880cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
881cabdff1aSopenharmony_ci    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
882cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
883cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l;
884cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
885cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
886cabdff1aSopenharmony_ci
887cabdff1aSopenharmony_ci    src0_ptr -= src_stride_3x + 3;
888cabdff1aSopenharmony_ci
889cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
890cabdff1aSopenharmony_ci              6, filt0, filt1, filt2, filt3);
891cabdff1aSopenharmony_ci    filt_h3 = __lsx_vld(filter_y, 0);
892cabdff1aSopenharmony_ci    filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
895cabdff1aSopenharmony_ci              filt_h0, filt_h1, filt_h2, filt_h3);
896cabdff1aSopenharmony_ci
897cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
898cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
899cabdff1aSopenharmony_ci
900cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
901cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
902cabdff1aSopenharmony_ci        dst_tmp = dst;
903cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
904cabdff1aSopenharmony_ci
905cabdff1aSopenharmony_ci        src0 = __lsx_vld(src0_ptr_tmp, 0);
906cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
907cabdff1aSopenharmony_ci                  src_stride_2x, src1, src2);
908cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
909cabdff1aSopenharmony_ci        src0_ptr_tmp += src_stride_4x;
910cabdff1aSopenharmony_ci        src4 = __lsx_vld(src0_ptr_tmp, 0);
911cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
912cabdff1aSopenharmony_ci                  src_stride_2x, src5, src6);
913cabdff1aSopenharmony_ci        src0_ptr_tmp += src_stride_3x;
914cabdff1aSopenharmony_ci
915cabdff1aSopenharmony_ci        /* row 0 row 1 row 2 row 3 */
916cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
917cabdff1aSopenharmony_ci                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
918cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
919cabdff1aSopenharmony_ci                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
920cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
921cabdff1aSopenharmony_ci                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
922cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
923cabdff1aSopenharmony_ci                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
924cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
925cabdff1aSopenharmony_ci                  vec12, filt0, dst0, dst1, dst2, dst3);
926cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
927cabdff1aSopenharmony_ci                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
928cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
929cabdff1aSopenharmony_ci                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
930cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
931cabdff1aSopenharmony_ci                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
932cabdff1aSopenharmony_ci
933cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
934cabdff1aSopenharmony_ci                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
935cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
936cabdff1aSopenharmony_ci                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
937cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
938cabdff1aSopenharmony_ci                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
939cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
940cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
941cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
942cabdff1aSopenharmony_ci                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
943cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
944cabdff1aSopenharmony_ci                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
945cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
946cabdff1aSopenharmony_ci
947cabdff1aSopenharmony_ci        for (loop_cnt = height; loop_cnt--;) {
948cabdff1aSopenharmony_ci            src7 = __lsx_vld(src0_ptr_tmp, 0);
949cabdff1aSopenharmony_ci            src0_ptr_tmp += src_stride;
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci            in0 = __lsx_vld(src1_ptr_tmp, 0);
952cabdff1aSopenharmony_ci            src1_ptr_tmp += src2_stride;
953cabdff1aSopenharmony_ci
954cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
955cabdff1aSopenharmony_ci                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
956cabdff1aSopenharmony_ci            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
957cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
958cabdff1aSopenharmony_ci                      filt2, dst7, dst7);
959cabdff1aSopenharmony_ci            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
960cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
961cabdff1aSopenharmony_ci                      dst6, dst10_r, dst32_r, dst54_r, dst76_r);
962cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
963cabdff1aSopenharmony_ci                      dst6, dst10_l, dst32_l, dst54_l, dst76_l);
964cabdff1aSopenharmony_ci
965cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
966cabdff1aSopenharmony_ci                      dst0_r, dst0_l);
967cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
968cabdff1aSopenharmony_ci                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
969cabdff1aSopenharmony_ci                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
970cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
971cabdff1aSopenharmony_ci                      dst76_l, filt_h3, dst0_r, dst0_l);
972cabdff1aSopenharmony_ci            dst0_r = __lsx_vsrli_w(dst0_r, 6);
973cabdff1aSopenharmony_ci            dst0_l = __lsx_vsrli_w(dst0_l, 6);
974cabdff1aSopenharmony_ci
975cabdff1aSopenharmony_ci            tmp = __lsx_vpickev_h(dst0_l, dst0_r);
976cabdff1aSopenharmony_ci            tmp = __lsx_vsadd_h(tmp, in0);
977cabdff1aSopenharmony_ci            tmp = __lsx_vmaxi_h(tmp, 0);
978cabdff1aSopenharmony_ci            out = __lsx_vssrlrni_bu_h(tmp, tmp, 7);
979cabdff1aSopenharmony_ci            __lsx_vstelm_d(out, dst_tmp, 0, 0);
980cabdff1aSopenharmony_ci            dst_tmp += dst_stride;
981cabdff1aSopenharmony_ci
982cabdff1aSopenharmony_ci            dst0 = dst1;
983cabdff1aSopenharmony_ci            dst1 = dst2;
984cabdff1aSopenharmony_ci            dst2 = dst3;
985cabdff1aSopenharmony_ci            dst3 = dst4;
986cabdff1aSopenharmony_ci            dst4 = dst5;
987cabdff1aSopenharmony_ci            dst5 = dst6;
988cabdff1aSopenharmony_ci            dst6 = dst7;
989cabdff1aSopenharmony_ci        }
990cabdff1aSopenharmony_ci
991cabdff1aSopenharmony_ci        src0_ptr += 8;
992cabdff1aSopenharmony_ci        dst += 8;
993cabdff1aSopenharmony_ci        src1_ptr += 8;
994cabdff1aSopenharmony_ci    }
995cabdff1aSopenharmony_ci}
996cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter for an 8-column block.
 *
 * Thin wrapper: all arguments are forwarded untouched to
 * hevc_hv_8t_8multx1mult_lsx() with the width fixed at 8, i.e. a single
 * 8-column strip (the worker iterates over width >> 3 strips).
 */
static void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                              int16_t *src1_ptr, int32_t src2_stride,
                              uint8_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    const int32_t width = 8;

    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride,
                               src1_ptr, src2_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, width);
}
1006cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter for a 16-column block.
 *
 * Thin wrapper: all arguments are forwarded untouched to
 * hevc_hv_8t_8multx1mult_lsx() with the width fixed at 16, i.e. two
 * 8-column strips (the worker iterates over width >> 3 strips).
 */
static void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 16;

    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride,
                               src1_ptr, src2_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, width);
}
1016cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter for a 24-column block.
 *
 * Thin wrapper: all arguments are forwarded untouched to
 * hevc_hv_8t_8multx1mult_lsx() with the width fixed at 24, i.e. three
 * 8-column strips (the worker iterates over width >> 3 strips).
 */
static void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 24;

    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride,
                               src1_ptr, src2_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, width);
}
1026cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter for a 32-column block.
 *
 * Thin wrapper: all arguments are forwarded untouched to
 * hevc_hv_8t_8multx1mult_lsx() with the width fixed at 32, i.e. four
 * 8-column strips (the worker iterates over width >> 3 strips).
 */
static void hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 32;

    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride,
                               src1_ptr, src2_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, width);
}
1036cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter for a 48-column block.
 *
 * Thin wrapper: all arguments are forwarded untouched to
 * hevc_hv_8t_8multx1mult_lsx() with the width fixed at 48, i.e. six
 * 8-column strips (the worker iterates over width >> 3 strips).
 */
static void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 48;

    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride,
                               src1_ptr, src2_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, width);
}
1046cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter for a 64-column block.
 *
 * Thin wrapper: all arguments are forwarded untouched to
 * hevc_hv_8t_8multx1mult_lsx() with the width fixed at 64, i.e. eight
 * 8-column strips (the worker iterates over width >> 3 strips).
 */
static void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 64;

    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride,
                               src1_ptr, src2_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, width);
}
1056cabdff1aSopenharmony_ci
1057cabdff1aSopenharmony_cistatic void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1058cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
1059cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
1060cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1061cabdff1aSopenharmony_ci{
1062cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
1063cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1064cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1065cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
1066cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
1067cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1068cabdff1aSopenharmony_ci    int32_t src2_stride_x = src2_stride << 1;
1069cabdff1aSopenharmony_ci    int32_t src2_stride_2x = src2_stride << 2;
1070cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1071cabdff1aSopenharmony_ci
1072cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1073cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1074cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1075cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1076cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
1077cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
1078cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1079cabdff1aSopenharmony_ci
1080cabdff1aSopenharmony_ci    src0_ptr -= 1;
1081cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1082cabdff1aSopenharmony_ci
1083cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
1084cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 10);
1085cabdff1aSopenharmony_ci
1086cabdff1aSopenharmony_ci    dst_tmp = dst + 16;
1087cabdff1aSopenharmony_ci    src1_ptr_tmp = src1_ptr + 16;
1088cabdff1aSopenharmony_ci
1089cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1090cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
1091cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1092cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
1093cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1094cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
1095cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1096cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
1097cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1098cabdff1aSopenharmony_ci
1099cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
1100cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1101cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
1102cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1103cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
1104cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1105cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
1106cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1107cabdff1aSopenharmony_ci
1108cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2,
1109cabdff1aSopenharmony_ci                  src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
1110cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1111cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
1112cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2,
1113cabdff1aSopenharmony_ci                  src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
1114cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1115cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1116cabdff1aSopenharmony_ci
1117cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
1118cabdff1aSopenharmony_ci                  src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
1119cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1120cabdff1aSopenharmony_ci                  vec3, filt0, dst4, dst5, dst6, dst7);
1121cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
1122cabdff1aSopenharmony_ci                  src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
1123cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
1124cabdff1aSopenharmony_ci                  dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);
1125cabdff1aSopenharmony_ci
1126cabdff1aSopenharmony_ci        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
1127cabdff1aSopenharmony_ci        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
1128cabdff1aSopenharmony_ci        dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
1129cabdff1aSopenharmony_ci        dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
1130cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
1131cabdff1aSopenharmony_ci        __lsx_vstx(dst1, dst, dst_stride);
1132cabdff1aSopenharmony_ci        __lsx_vstx(dst2, dst, dst_stride_2x);
1133cabdff1aSopenharmony_ci        __lsx_vstx(dst3, dst, dst_stride_3x);
1134cabdff1aSopenharmony_ci        dst += dst_stride_4x;
1135cabdff1aSopenharmony_ci
1136cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr_tmp, 0);
1137cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
1138cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
1139cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
1140cabdff1aSopenharmony_ci        src1_ptr_tmp += src2_stride_2x;
1141cabdff1aSopenharmony_ci
1142cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5,
1143cabdff1aSopenharmony_ci                  src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
1144cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1145cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
1146cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5,
1147cabdff1aSopenharmony_ci                  src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
1148cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1149cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1150cabdff1aSopenharmony_ci        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
1151cabdff1aSopenharmony_ci        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
1152cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0, dst_tmp, 0, 0);
1153cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
1154cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
1155cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
1156cabdff1aSopenharmony_ci        dst_tmp += dst_stride_4x;
1157cabdff1aSopenharmony_ci    }
1158cabdff1aSopenharmony_ci}
1159cabdff1aSopenharmony_ci
1160cabdff1aSopenharmony_cistatic void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1161cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
1162cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
1163cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1164cabdff1aSopenharmony_ci{
1165cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1166cabdff1aSopenharmony_ci    __m128i src0, src1, src2;
1167cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
1168cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1169cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1170cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
1171cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
1172cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
1173cabdff1aSopenharmony_ci
1174cabdff1aSopenharmony_ci    src0_ptr -= 1;
1175cabdff1aSopenharmony_ci
1176cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1177cabdff1aSopenharmony_ci
1178cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
1179cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 10);
1180cabdff1aSopenharmony_ci
1181cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1182cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
1183cabdff1aSopenharmony_ci        src2 = __lsx_vld(src0_ptr, 24);
1184cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1185cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
1186cabdff1aSopenharmony_ci                  src1_ptr, 48, in0, in1, in2, in3);
1187cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1188cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1,
1189cabdff1aSopenharmony_ci                  src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
1190cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1191cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
1192cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1,
1193cabdff1aSopenharmony_ci                  src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
1194cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1195cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1196cabdff1aSopenharmony_ci        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
1197cabdff1aSopenharmony_ci        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
1198cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
1199cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
1200cabdff1aSopenharmony_ci        dst += dst_stride;
1201cabdff1aSopenharmony_ci    }
1202cabdff1aSopenharmony_ci}
1203cabdff1aSopenharmony_ci
1204cabdff1aSopenharmony_cistatic void hevc_vt_4t_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1205cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
1206cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
1207cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1208cabdff1aSopenharmony_ci{
1209cabdff1aSopenharmony_ci    int32_t loop_cnt;
1210cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1211cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
1212cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
1213cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1214cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
1215cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
1216cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1217cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1218cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1219cabdff1aSopenharmony_ci    int16_t *_src1 = src1_ptr + 8;
1220cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6;
1221cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1222cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
1223cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1224cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
1225cabdff1aSopenharmony_ci    __m128i src2110, src4332, src6554;
1226cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l, filt0, filt1;
1227cabdff1aSopenharmony_ci
1228cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
1229cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1230cabdff1aSopenharmony_ci
1231cabdff1aSopenharmony_ci    src0 = __lsx_vld(src0_ptr, 0);
1232cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1233cabdff1aSopenharmony_ci              src1, src2);
1234cabdff1aSopenharmony_ci    src0_ptr += src_stride_3x;
1235cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1236cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1237cabdff1aSopenharmony_ci    src2110 = __lsx_vilvl_d(src21_l, src10_l);
1238cabdff1aSopenharmony_ci
1239cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1240cabdff1aSopenharmony_ci        src3 = __lsx_vld(src0_ptr, 0);
1241cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1242cabdff1aSopenharmony_ci                  src4, src5);
1243cabdff1aSopenharmony_ci        src6 = __lsx_vldx(src0_ptr, src_stride_3x);
1244cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
1245cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
1246cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
1247cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
1248cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
1249cabdff1aSopenharmony_ci        src1_ptr += src2_stride_2x;
1250cabdff1aSopenharmony_ci        in4 = __lsx_vld(_src1, 0);
1251cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
1252cabdff1aSopenharmony_ci                  in5, in6);
1253cabdff1aSopenharmony_ci        in7 = __lsx_vldx(_src1, src2_stride_3x);
1254cabdff1aSopenharmony_ci        _src1 += src2_stride_2x;
1255cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
1258cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
1259cabdff1aSopenharmony_ci        src4332 = __lsx_vilvl_d(src43_l, src32_l);
1260cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r);
1261cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l);
1262cabdff1aSopenharmony_ci        src6554 = __lsx_vilvl_d(src65_l, src54_l);
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110,
1265cabdff1aSopenharmony_ci                  filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r);
1266cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0,
1267cabdff1aSopenharmony_ci                  dst3_r, dst1_l);
1268cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
1269cabdff1aSopenharmony_ci                  src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r,
1270cabdff1aSopenharmony_ci                  filt1, dst0_r, dst1_r, dst0_l, dst2_r);
1271cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l,
1272cabdff1aSopenharmony_ci                  src6554, filt1, dst3_r, dst1_l);
1273cabdff1aSopenharmony_ci        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
1274cabdff1aSopenharmony_ci        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
1275cabdff1aSopenharmony_ci        dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l);
1276cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst, 0, 0);
1277cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
1278cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
1279cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
1280cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0_l, dst, 8, 0);
1281cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1);
1282cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2);
1283cabdff1aSopenharmony_ci        __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3);
1284cabdff1aSopenharmony_ci        dst += dst_stride_4x;
1285cabdff1aSopenharmony_ci
1286cabdff1aSopenharmony_ci        src2 = src6;
1287cabdff1aSopenharmony_ci        src10_r = src54_r;
1288cabdff1aSopenharmony_ci        src21_r = src65_r;
1289cabdff1aSopenharmony_ci        src2110 = src6554;
1290cabdff1aSopenharmony_ci    }
1291cabdff1aSopenharmony_ci}
1292cabdff1aSopenharmony_ci
1293cabdff1aSopenharmony_cistatic void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1294cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
1295cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
1296cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1297cabdff1aSopenharmony_ci{
1298cabdff1aSopenharmony_ci    int32_t loop_cnt;
1299cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
1300cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
1301cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
1302cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5;
1303cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
1304cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src21_r, src43_r;
1305cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src21_l, src43_l;
1306cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst0_l, dst1_l;
1307cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1308cabdff1aSopenharmony_ci
1309cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
1310cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1311cabdff1aSopenharmony_ci
1312cabdff1aSopenharmony_ci    src0 = __lsx_vld(src0_ptr, 0);
1313cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1314cabdff1aSopenharmony_ci              src1, src2);
1315cabdff1aSopenharmony_ci    src0_ptr += src_stride_3x;
1316cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1317cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1318cabdff1aSopenharmony_ci
1319cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1320cabdff1aSopenharmony_ci        src3 = __lsx_vld(src0_ptr, 0);
1321cabdff1aSopenharmony_ci        src4 = __lsx_vldx(src0_ptr, src_stride);
1322cabdff1aSopenharmony_ci        src0_ptr += src_stride_2x;
1323cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
1324cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1325cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
1326cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1327cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
1328cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
1329cabdff1aSopenharmony_ci
1330cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
1331cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
1332cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
1333cabdff1aSopenharmony_ci                  filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1,
1334cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst0_l, dst1_l);
1335cabdff1aSopenharmony_ci
1336cabdff1aSopenharmony_ci        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1337cabdff1aSopenharmony_ci        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1338cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
1339cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst, dst_stride);
1340cabdff1aSopenharmony_ci        dst += dst_stride_2x;
1341cabdff1aSopenharmony_ci
1342cabdff1aSopenharmony_ci        src5 = __lsx_vld(src0_ptr, 0);
1343cabdff1aSopenharmony_ci        src2 = __lsx_vldx(src0_ptr, src_stride);
1344cabdff1aSopenharmony_ci        src0_ptr += src_stride_2x;
1345cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
1346cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1347cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
1348cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1349cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
1350cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
1351cabdff1aSopenharmony_ci
1352cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
1353cabdff1aSopenharmony_ci                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
1354cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
1355cabdff1aSopenharmony_ci                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
1356cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
1357cabdff1aSopenharmony_ci        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1358cabdff1aSopenharmony_ci        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1359cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
1360cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst, dst_stride);
1361cabdff1aSopenharmony_ci        dst += dst_stride_2x;
1362cabdff1aSopenharmony_ci    }
1363cabdff1aSopenharmony_ci}
1364cabdff1aSopenharmony_ci
1365cabdff1aSopenharmony_cistatic void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1366cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
1367cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
1368cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1369cabdff1aSopenharmony_ci{
1370cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1371cabdff1aSopenharmony_ci    int32_t dst_stride_2x = dst_stride << 1;
1372cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5;
1373cabdff1aSopenharmony_ci    __m128i src6, src7, src8, src9, src10, src11;
1374cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5;
1375cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src76_r, src98_r;
1376cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src87_r, src109_r;
1377cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src21_l, src43_l;
1378cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1379cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l;
1380cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1381cabdff1aSopenharmony_ci
1382cabdff1aSopenharmony_ci    src0_ptr -= src_stride;
1383cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
1384cabdff1aSopenharmony_ci
1385cabdff1aSopenharmony_ci    /* 16width */
1386cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6);
1387cabdff1aSopenharmony_ci    src0_ptr += src_stride;
1388cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7);
1389cabdff1aSopenharmony_ci    src0_ptr += src_stride;
1390cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
1391cabdff1aSopenharmony_ci    src0_ptr += src_stride;
1392cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1393cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1394cabdff1aSopenharmony_ci    /* 8width */
1395cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
1396cabdff1aSopenharmony_ci
1397cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1398cabdff1aSopenharmony_ci        /* 16width */
1399cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9);
1400cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1401cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10);
1402cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1403cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
1404cabdff1aSopenharmony_ci        in4 = __lsx_vld(src1_ptr, 32);
1405cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1406cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
1407cabdff1aSopenharmony_ci        in5 = __lsx_vld(src1_ptr, 32);
1408cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1409cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
1410cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
1411cabdff1aSopenharmony_ci        /* 8width */
1412cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
1413cabdff1aSopenharmony_ci        /* 16width */
1414cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
1415cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
1416cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1,  dst0_l,
1417cabdff1aSopenharmony_ci                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1,
1418cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
1419cabdff1aSopenharmony_ci        /* 8width */
1420cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
1421cabdff1aSopenharmony_ci                  dst2_r, dst3_r);
1422cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
1423cabdff1aSopenharmony_ci                  src109_r, filt1, dst2_r, dst3_r);
1424cabdff1aSopenharmony_ci        /* 16width */
1425cabdff1aSopenharmony_ci        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1426cabdff1aSopenharmony_ci        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1427cabdff1aSopenharmony_ci        dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
1428cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
1429cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst, dst_stride);
1430cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst, 16, 0);
1431cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
1432cabdff1aSopenharmony_ci        dst += dst_stride_2x;
1433cabdff1aSopenharmony_ci
1434cabdff1aSopenharmony_ci        /* 16width */
1435cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr,
1436cabdff1aSopenharmony_ci                  32, src5, in0, in2, in4);
1437cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1438cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src0_ptr, 16,  src1_ptr, 0, src1_ptr, 16, src1_ptr,
1439cabdff1aSopenharmony_ci                  32, src11, in1, in3, in5);
1440cabdff1aSopenharmony_ci        src1_ptr += src2_stride;
1441cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1442cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src0_ptr, 0,  src0_ptr, 16, src2, src8);
1443cabdff1aSopenharmony_ci        src0_ptr += src_stride;
1444cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
1445cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
1446cabdff1aSopenharmony_ci        /* 8width */
1447cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
1448cabdff1aSopenharmony_ci        /* 16width */
1449cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
1450cabdff1aSopenharmony_ci                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
1451cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
1452cabdff1aSopenharmony_ci                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
1453cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
1454cabdff1aSopenharmony_ci
1455cabdff1aSopenharmony_ci        /* 8width */
1456cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
1457cabdff1aSopenharmony_ci                  dst2_r, dst3_r);
1458cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b,  dst2_r, src76_r, filt1, dst3_r,
1459cabdff1aSopenharmony_ci                  src87_r, filt1, dst2_r, dst3_r);
1460cabdff1aSopenharmony_ci
1461cabdff1aSopenharmony_ci        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
1462cabdff1aSopenharmony_ci        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
1463cabdff1aSopenharmony_ci        dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
1464cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
1465cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst, dst_stride);
1466cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst, 16, 0);
1467cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
1468cabdff1aSopenharmony_ci        dst += dst_stride_2x;
1469cabdff1aSopenharmony_ci    }
1470cabdff1aSopenharmony_ci}
1471cabdff1aSopenharmony_ci
/*
 * Vertical 4-tap bi-prediction filter for a 32-column block, done as two
 * independent 16-column passes at column offsets 0 and 16.  All arguments
 * are forwarded unchanged to hevc_vt_4t_16w_lsx() apart from the three base
 * pointers, which are advanced by the column offset (in elements of their
 * own type).
 */
static void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        hevc_vt_4t_16w_lsx(src0_ptr + col, src_stride,
                           src1_ptr + col, src2_stride,
                           dst + col, dst_stride, filter, height);
    }
}
1482cabdff1aSopenharmony_ci
1483cabdff1aSopenharmony_cistatic void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
1484cabdff1aSopenharmony_ci                              int16_t *src1_ptr, int32_t src2_stride,
1485cabdff1aSopenharmony_ci                              uint8_t *dst, int32_t dst_stride,
1486cabdff1aSopenharmony_ci                              const int8_t *filter_x, const int8_t *filter_y,
1487cabdff1aSopenharmony_ci                              int32_t height)
1488cabdff1aSopenharmony_ci{
1489cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1490cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
1491cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1492cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
1493cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 1);
1494cabdff1aSopenharmony_ci    int32_t src2_stride_4x = (src2_stride << 2);
1495cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1496cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1497cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
1498cabdff1aSopenharmony_ci    __m128i out0, out1;
1499cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6;
1500cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1;
1501cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt_h0, filt_h1;
1502cabdff1aSopenharmony_ci    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5;
1503cabdff1aSopenharmony_ci    __m128i dsth6, dsth7, dsth8, dsth9, dsth10;
1504cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1505cabdff1aSopenharmony_ci    __m128i dst4_r, dst5_r, dst6_r, dst7_r;
1506cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
1507cabdff1aSopenharmony_ci    __m128i reg0, reg1, reg2, reg3;
1508cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1509cabdff1aSopenharmony_ci
1510cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
1511cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1512cabdff1aSopenharmony_ci
1513cabdff1aSopenharmony_ci    filt_h1 = __lsx_vld(filter_y, 0);
1514cabdff1aSopenharmony_ci    filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0);
1515cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1);
1516cabdff1aSopenharmony_ci
1517cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
1518cabdff1aSopenharmony_ci
1519cabdff1aSopenharmony_ci    src0 = __lsx_vld(src0_ptr, 0);
1520cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1521cabdff1aSopenharmony_ci              src1, src2);
1522cabdff1aSopenharmony_ci    src0_ptr += src_stride_3x;
1523cabdff1aSopenharmony_ci
1524cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1525cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1526cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1527cabdff1aSopenharmony_ci
1528cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
1529cabdff1aSopenharmony_ci    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1530cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
1531cabdff1aSopenharmony_ci              dsth0, dsth1);
1532cabdff1aSopenharmony_ci    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
1533cabdff1aSopenharmony_ci
1534cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2);
1535cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3);
1536cabdff1aSopenharmony_ci
1537cabdff1aSopenharmony_ci    src3 = __lsx_vld(src0_ptr, 0);
1538cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1539cabdff1aSopenharmony_ci              src4, src5);
1540cabdff1aSopenharmony_ci    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
1541cabdff1aSopenharmony_ci    src0_ptr += src_stride_4x;
1542cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
1543cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
1544cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
1545cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
1546cabdff1aSopenharmony_ci
1547cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1548cabdff1aSopenharmony_ci              filt0, dsth3, dsth4, dsth5, dsth6);
1549cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1, dsth5,
1550cabdff1aSopenharmony_ci              vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5, dsth6);
1551cabdff1aSopenharmony_ci
1552cabdff1aSopenharmony_ci    src3 = __lsx_vld(src0_ptr, 0);
1553cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1554cabdff1aSopenharmony_ci              src4, src5);
1555cabdff1aSopenharmony_ci    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
1556cabdff1aSopenharmony_ci
1557cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
1558cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
1559cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
1560cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
1561cabdff1aSopenharmony_ci
1562cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1563cabdff1aSopenharmony_ci              filt0, dsth7, dsth8, dsth9, dsth10);
1564cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1, dsth9,
1565cabdff1aSopenharmony_ci              vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9, dsth10);
1566cabdff1aSopenharmony_ci
1567cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6);
1568cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7);
1569cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2);
1570cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3);
1571cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0,
1572cabdff1aSopenharmony_ci              tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
1573cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6,
1574cabdff1aSopenharmony_ci              filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1,
1575cabdff1aSopenharmony_ci              dst0_r, dst1_r, dst2_r, dst3_r);
1576cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8);
1577cabdff1aSopenharmony_ci    dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0);
1578cabdff1aSopenharmony_ci    dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1);
1579cabdff1aSopenharmony_ci
1580cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2);
1581cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3);
1582cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6);
1583cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7);
1584cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0,
1585cabdff1aSopenharmony_ci              tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r);
1586cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2,
1587cabdff1aSopenharmony_ci              filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1,
1588cabdff1aSopenharmony_ci              dst4_r, dst5_r, dst6_r, dst7_r);
1589cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1);
1590cabdff1aSopenharmony_ci    tmp2 = __lsx_vpickev_d(tmp7, tmp5);
1591cabdff1aSopenharmony_ci
1592cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l);
1593cabdff1aSopenharmony_ci    dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0);
1594cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1,
1595cabdff1aSopenharmony_ci              dst1_l, dst2_l);
1596cabdff1aSopenharmony_ci    dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1);
1597cabdff1aSopenharmony_ci
1598cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_d, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
1599cabdff1aSopenharmony_ci              dst0_r, dst1_r, dst2_r, dst3_r);
1600cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_d, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6,
1601cabdff1aSopenharmony_ci              dst4_r, dst5_r, dst6_r, dst7_r);
1602cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_d, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
1603cabdff1aSopenharmony_ci              dst0_l, dst1_l, dst2_l, dst3_l);
1604cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
1605cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
1606cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
1607cabdff1aSopenharmony_ci
1608cabdff1aSopenharmony_ci    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
1609cabdff1aSopenharmony_ci    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
1610cabdff1aSopenharmony_ci    dsth0 = __lsx_vilvl_d(reg1, reg0);
1611cabdff1aSopenharmony_ci    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
1612cabdff1aSopenharmony_ci    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
1613cabdff1aSopenharmony_ci    dsth1 = __lsx_vilvl_d(reg1, reg0);
1614cabdff1aSopenharmony_ci    src1_ptr += src2_stride_4x;
1615cabdff1aSopenharmony_ci    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
1616cabdff1aSopenharmony_ci    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
1617cabdff1aSopenharmony_ci    dsth2 = __lsx_vilvl_d(reg1, reg0);
1618cabdff1aSopenharmony_ci    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
1619cabdff1aSopenharmony_ci    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
1620cabdff1aSopenharmony_ci    dsth3 = __lsx_vilvl_d(reg1, reg0);
1621cabdff1aSopenharmony_ci
1622cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3,
1623cabdff1aSopenharmony_ci              tmp3, tmp0, tmp1, tmp2, tmp3);
1624cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
1625cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
1626cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1627cabdff1aSopenharmony_ci
1628cabdff1aSopenharmony_ci    __lsx_vstelm_w(out0, dst, 0, 0);
1629cabdff1aSopenharmony_ci    __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
1630cabdff1aSopenharmony_ci    __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
1631cabdff1aSopenharmony_ci    __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
1632cabdff1aSopenharmony_ci    dst += dst_stride_4x;
1633cabdff1aSopenharmony_ci    __lsx_vstelm_w(out1, dst, 0, 0);
1634cabdff1aSopenharmony_ci    __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
1635cabdff1aSopenharmony_ci    __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
1636cabdff1aSopenharmony_ci    __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
1637cabdff1aSopenharmony_ci    dst -= dst_stride_4x;
1638cabdff1aSopenharmony_ci
1639cabdff1aSopenharmony_ci    src1_ptr -= src2_stride_4x;
1640cabdff1aSopenharmony_ci
1641cabdff1aSopenharmony_ci    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
1642cabdff1aSopenharmony_ci    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
1643cabdff1aSopenharmony_ci    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
1644cabdff1aSopenharmony_ci    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
1645cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
1646cabdff1aSopenharmony_ci    dsth4 = __lsx_vilvl_d(tmp1, tmp0);
1647cabdff1aSopenharmony_ci    src1_ptr += src2_stride_4x;
1648cabdff1aSopenharmony_ci
1649cabdff1aSopenharmony_ci    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
1650cabdff1aSopenharmony_ci    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
1651cabdff1aSopenharmony_ci    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
1652cabdff1aSopenharmony_ci    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
1653cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
1654cabdff1aSopenharmony_ci    dsth5 = __lsx_vilvl_d(tmp1, tmp0);
1655cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5);
1656cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 7, tmp4, tmp5);
1657cabdff1aSopenharmony_ci    out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
1658cabdff1aSopenharmony_ci
1659cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst, 4, 0);
1660cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst + dst_stride, 4, 1);
1661cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2);
1662cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3);
1663cabdff1aSopenharmony_ci    dst += dst_stride_4x;
1664cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst, 4, 4);
1665cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst + dst_stride, 4, 5);
1666cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6);
1667cabdff1aSopenharmony_ci    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7);
1668cabdff1aSopenharmony_ci}
1669cabdff1aSopenharmony_ci
1670cabdff1aSopenharmony_cistatic av_always_inline
1671cabdff1aSopenharmony_civoid hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
1672cabdff1aSopenharmony_ci                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
1673cabdff1aSopenharmony_ci                        const int8_t *filter_x, const int8_t *filter_y)
1674cabdff1aSopenharmony_ci{
1675cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1676cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1677cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1678cabdff1aSopenharmony_ci
1679cabdff1aSopenharmony_ci    __m128i out;
1680cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4;
1681cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1682cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1;
1683cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1684cabdff1aSopenharmony_ci    __m128i mask1, filter_vec;
1685cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1686cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4;
1687cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
1688cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1689cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1690cabdff1aSopenharmony_ci    __m128i tmp0, tmp1;
1691cabdff1aSopenharmony_ci    __m128i in0, in1;
1692cabdff1aSopenharmony_ci
1693cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
1694cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1695cabdff1aSopenharmony_ci
1696cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1697cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1698cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1699cabdff1aSopenharmony_ci
1700cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
1701cabdff1aSopenharmony_ci
1702cabdff1aSopenharmony_ci    src0 = __lsx_vld(src0_ptr, 0);
1703cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1704cabdff1aSopenharmony_ci              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
1705cabdff1aSopenharmony_ci              src1, src2, src3, src4);
1706cabdff1aSopenharmony_ci
1707cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);
1708cabdff1aSopenharmony_ci
1709cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1710cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1711cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1712cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
1713cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
1714cabdff1aSopenharmony_ci
1715cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1716cabdff1aSopenharmony_ci              filt0, dst0, dst1, dst2, dst3);
1717cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
1718cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
1719cabdff1aSopenharmony_ci              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
1720cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
1721cabdff1aSopenharmony_ci
1722cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1723cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1724cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
1725cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
1726cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1727cabdff1aSopenharmony_ci              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1728cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1729cabdff1aSopenharmony_ci              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1730cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
1731cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
1732cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
1733cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
1734cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
1735cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
1736cabdff1aSopenharmony_ci    out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
1737cabdff1aSopenharmony_ci    __lsx_vstelm_d(out, dst, 0, 0);
1738cabdff1aSopenharmony_ci    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
1739cabdff1aSopenharmony_ci}
1740cabdff1aSopenharmony_ci
1741cabdff1aSopenharmony_cistatic av_always_inline
1742cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
1743cabdff1aSopenharmony_ci                            int16_t *src1_ptr, int32_t src2_stride,
1744cabdff1aSopenharmony_ci                            uint8_t *dst, int32_t dst_stride,
1745cabdff1aSopenharmony_ci                            const int8_t *filter_x, const int8_t *filter_y,
1746cabdff1aSopenharmony_ci                            int32_t width8mult)
1747cabdff1aSopenharmony_ci{
1748cabdff1aSopenharmony_ci    uint32_t cnt;
1749cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1750cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
1751cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1752cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
1753cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
1754cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1755cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1756cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1757cabdff1aSopenharmony_ci
1758cabdff1aSopenharmony_ci    __m128i out0, out1;
1759cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
1760cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1761cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
1762cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
1763cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
1764cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1765cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
1766cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
1767cabdff1aSopenharmony_ci
1768cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
1769cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1770cabdff1aSopenharmony_ci
1771cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1772cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1773cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1774cabdff1aSopenharmony_ci
1775cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1776cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
1777cabdff1aSopenharmony_ci
1778cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
1779cabdff1aSopenharmony_ci        src0 = __lsx_vld(src0_ptr, 0);
1780cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1781cabdff1aSopenharmony_ci                  src1, src2);
1782cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
1783cabdff1aSopenharmony_ci        src0_ptr += src_stride_4x;
1784cabdff1aSopenharmony_ci        src4 = __lsx_vld(src0_ptr, 0);
1785cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1786cabdff1aSopenharmony_ci                  src5, src6);
1787cabdff1aSopenharmony_ci        src0_ptr += (8 - src_stride_4x);
1788cabdff1aSopenharmony_ci
1789cabdff1aSopenharmony_ci        in0 = __lsx_vld(src1_ptr, 0);
1790cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
1791cabdff1aSopenharmony_ci                  src2_stride_2x, in1, in2);
1792cabdff1aSopenharmony_ci        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
1793cabdff1aSopenharmony_ci        src1_ptr += 8;
1794cabdff1aSopenharmony_ci
1795cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
1796cabdff1aSopenharmony_ci                  vec0, vec1);
1797cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
1798cabdff1aSopenharmony_ci                  vec2, vec3);
1799cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
1800cabdff1aSopenharmony_ci                  vec4, vec5);
1801cabdff1aSopenharmony_ci
1802cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1803cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1804cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1805cabdff1aSopenharmony_ci                  dst0, dst1);
1806cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
1807cabdff1aSopenharmony_ci
1808cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1809cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1810cabdff1aSopenharmony_ci
1811cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
1812cabdff1aSopenharmony_ci                  vec0, vec1);
1813cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
1814cabdff1aSopenharmony_ci                  vec2, vec3);
1815cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
1816cabdff1aSopenharmony_ci                  vec4, vec5);
1817cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
1818cabdff1aSopenharmony_ci                  vec6, vec7);
1819cabdff1aSopenharmony_ci
1820cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1821cabdff1aSopenharmony_ci                  vec6, filt0, dst3, dst4, dst5, dst6);
1822cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
1823cabdff1aSopenharmony_ci                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
1824cabdff1aSopenharmony_ci
1825cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
1826cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
1827cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
1828cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
1829cabdff1aSopenharmony_ci
1830cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1831cabdff1aSopenharmony_ci                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1832cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1833cabdff1aSopenharmony_ci                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1834cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1835cabdff1aSopenharmony_ci                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1836cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
1837cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1838cabdff1aSopenharmony_ci                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1839cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
1840cabdff1aSopenharmony_ci
1841cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
1842cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
1843cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
1844cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
1845cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
1846cabdff1aSopenharmony_ci                  dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
1847cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
1848cabdff1aSopenharmony_ci                  tmp0, tmp1, tmp2, tmp3);
1849cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
1850cabdff1aSopenharmony_ci                  tmp0, tmp1, tmp2, tmp3);
1851cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1852cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst, 0, 0);
1853cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1854cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1855cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1856cabdff1aSopenharmony_ci        dst += 8;
1857cabdff1aSopenharmony_ci    }
1858cabdff1aSopenharmony_ci}
1859cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal followed by 4-tap vertical filtering of one block that is
 * 8 columns wide and 6 rows high, combined with a 16-bit second source and
 * stored as 8-bit pixels.
 *
 * src0_ptr    8-bit source. Reading starts one row above and one byte to the
 *             left of this pointer; 9 rows are loaded, 16 bytes each.
 * src_stride  distance in bytes between two rows of src0_ptr.
 * src1_ptr    16-bit source that is added (with signed saturation) to the
 *             filtered rows; 6 rows of 8 values are read.
 *             NOTE(review): presumably the intermediate samples of the other
 *             prediction - confirm against the callers.
 * src2_stride row pitch of src1_ptr counted in int16_t elements (the byte
 *             offsets used for the loads are derived as src2_stride << 1).
 * dst         8-bit destination; 8 bytes are written on each of 6 rows.
 * dst_stride  distance in bytes between two rows of dst.
 * filter_x    horizontal taps, read as two 16-bit pairs at byte offsets 0, 2.
 * filter_y    vertical taps; a full 16-byte vector is loaded from this
 *             address, but only the first four int8 values are used.
 */
1860cabdff1aSopenharmony_cistatic av_always_inline
1861cabdff1aSopenharmony_civoid hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
1862cabdff1aSopenharmony_ci                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
1863cabdff1aSopenharmony_ci                        const int8_t *filter_x, const int8_t *filter_y)
1864cabdff1aSopenharmony_ci{
    /* Pre-computed multiples of the strides. The src2_* values are used both
     * as byte offsets for __lsx_vldx and as element counts for pointer
     * arithmetic on the int16_t pointer (see below). */
1865cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1866cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
1867cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1868cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
1869cabdff1aSopenharmony_ci    int32_t src2_stride_x = (src2_stride << 1);
1870cabdff1aSopenharmony_ci    int32_t src2_stride_2x = (src2_stride << 2);
1871cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1872cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1873cabdff1aSopenharmony_ci    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1874cabdff1aSopenharmony_ci
1875cabdff1aSopenharmony_ci    __m128i out0, out1, out2;
1876cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
1877cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5;
1878cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1879cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1;
1880cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1881cabdff1aSopenharmony_ci    __m128i mask1, filter_vec;
1882cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1883cabdff1aSopenharmony_ci    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
1884cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1885cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1886cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1887cabdff1aSopenharmony_ci    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
1888cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
1889cabdff1aSopenharmony_ci    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
1890cabdff1aSopenharmony_ci    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
1891cabdff1aSopenharmony_ci    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
1892cabdff1aSopenharmony_ci
    /* Step back one row and one column so the 4-tap windows have the
     * samples they need before the first output position. */
1893cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
    /* Horizontal taps: each vector repeats one 16-bit pair of int8 taps. */
1894cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1895cabdff1aSopenharmony_ci
    /* Vertical taps: widen int8 to int16, then repeat 32-bit element 0
     * (taps 0,1) and element 1 (taps 2,3) across the vector. */
1896cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1897cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1898cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1899cabdff1aSopenharmony_ci
    /* mask0 selects the byte pairs (0,1),(1,2)...(7,8); mask1 selects the
     * pairs two bytes further to the right. */
1900cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
1901cabdff1aSopenharmony_ci
    /* Load the 9 source rows needed for 6 output rows of a 4-tap filter. */
1902cabdff1aSopenharmony_ci    src0 = __lsx_vld(src0_ptr, 0);
1903cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1904cabdff1aSopenharmony_ci              src1, src2);
1905cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
1906cabdff1aSopenharmony_ci    src0_ptr += src_stride_4x;
1907cabdff1aSopenharmony_ci    src4 = __lsx_vld(src0_ptr, 0);
1908cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
1909cabdff1aSopenharmony_ci              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
1910cabdff1aSopenharmony_ci              src5, src6, src7, src8);
1911cabdff1aSopenharmony_ci
    /* Load the 6 rows of the 16-bit source. The __lsx_vldx offsets are in
     * bytes, so src2_stride_x (= 2 * src2_stride) is one row. */
1912cabdff1aSopenharmony_ci    in0 = __lsx_vld(src1_ptr, 0);
1913cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
1914cabdff1aSopenharmony_ci              in1, in2);
1915cabdff1aSopenharmony_ci    in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
    /* src1_ptr is an int16_t pointer, so adding (src2_stride << 2) elements
     * moves it down four rows even though the variable is named "_2x". */
1916cabdff1aSopenharmony_ci    src1_ptr += src2_stride_2x;
1917cabdff1aSopenharmony_ci    in4 = __lsx_vld(src1_ptr, 0);
1918cabdff1aSopenharmony_ci    in5 = __lsx_vldx(src1_ptr, src2_stride_x);
1919cabdff1aSopenharmony_ci
    /* Gather the overlapping byte pairs of every source row. */
1920cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1921cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1922cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1923cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
1924cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
1925cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
1926cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
1927cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
1928cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
1929cabdff1aSopenharmony_ci
    /* Horizontal pass: first tap pair as a dot product, second tap pair
     * accumulated on top. dstN holds the 16-bit result of source row N. */
1930cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
1931cabdff1aSopenharmony_ci              filt0, dst0, dst1, dst2, dst3);
1932cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
1933cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
1934cabdff1aSopenharmony_ci              vec16, filt0, dst5, dst6, dst7, dst8);
1935cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
1936cabdff1aSopenharmony_ci              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
1937cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
1938cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
1939cabdff1aSopenharmony_ci              dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);
1940cabdff1aSopenharmony_ci
    /* Interleave each row with the next one so that a 32-bit lane holds two
     * vertically adjacent samples; _r is the low half, _l the high half. */
1941cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
1942cabdff1aSopenharmony_ci              dst10_r, dst21_r, dst32_r, dst43_r);
1943cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
1944cabdff1aSopenharmony_ci              dst10_l, dst21_l, dst32_l, dst43_l);
1945cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
1946cabdff1aSopenharmony_ci              dst54_r, dst65_r, dst76_r, dst87_r);
1947cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
1948cabdff1aSopenharmony_ci              dst54_l, dst65_l, dst76_l, dst87_l);
1949cabdff1aSopenharmony_ci
    /* Vertical pass: output row k uses rows (k, k+1) with the first tap
     * pair and rows (k+2, k+3) with the second tap pair. */
1950cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1951cabdff1aSopenharmony_ci              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1952cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1953cabdff1aSopenharmony_ci              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1954cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
1955cabdff1aSopenharmony_ci              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
1956cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1957cabdff1aSopenharmony_ci              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1958cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
1959cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1960cabdff1aSopenharmony_ci              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1961cabdff1aSopenharmony_ci              dst2_r, dst2_l, dst3_r, dst3_l);
1962cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
1963cabdff1aSopenharmony_ci              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
1964cabdff1aSopenharmony_ci              dst4_r, dst4_l, dst5_r, dst5_l);
1965cabdff1aSopenharmony_ci
    /* Scale the 32-bit sums down by 6 bits (arithmetic shift). */
1966cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
1967cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
1968cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
1969cabdff1aSopenharmony_ci              dst2_r, dst2_l, dst3_r, dst3_l);
1970cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
1971cabdff1aSopenharmony_ci              dst4_r, dst4_l, dst5_r, dst5_l);
    /* Keep the low 16 bits of every 32-bit lane and rejoin the two halves
     * into one vector of 8 halfwords per output row. */
1972cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
1973cabdff1aSopenharmony_ci              dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
1974cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    /* Add the 16-bit second source with signed saturation, clamp negative
     * values to zero, then round-shift right by 7 and narrow to unsigned
     * bytes with saturation. Each outN packs two rows (low / high 64 bits). */
1975cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
1976cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
1977cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
1978cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
1979cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
1980cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
1981cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1982cabdff1aSopenharmony_ci    out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
    /* Store 8 bytes per row: rows 0-3 first, then rows 4 and 5. */
1983cabdff1aSopenharmony_ci    __lsx_vstelm_d(out0, dst, 0, 0);
1984cabdff1aSopenharmony_ci    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1985cabdff1aSopenharmony_ci    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1986cabdff1aSopenharmony_ci    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1987cabdff1aSopenharmony_ci    dst += dst_stride_4x;
1988cabdff1aSopenharmony_ci    __lsx_vstelm_d(out2, dst, 0, 0);
1989cabdff1aSopenharmony_ci    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
1990cabdff1aSopenharmony_ci}
1991cabdff1aSopenharmony_ci
/*
 * Generic 4-tap horizontal + 4-tap vertical filter, combined with a 16-bit
 * second source and stored as 8-bit pixels.
 *
 * The block is processed as (width >> 3) strips of 8 columns; each strip is
 * processed as (height >> 2) groups of 4 rows. Columns beyond a multiple of
 * 8 and rows beyond a multiple of 4 are not touched by this function.
 *
 * src0_ptr    8-bit source. Reading starts one row above and one byte to the
 *             left of this pointer; every row load fetches 16 bytes.
 * src_stride  distance in bytes between two rows of src0_ptr.
 * src1_ptr    16-bit source that is added (with signed saturation) to the
 *             filtered rows.
 * src2_stride row pitch of src1_ptr counted in int16_t elements (the byte
 *             offsets used for the loads are derived as src2_stride << 1).
 * dst         8-bit destination.
 * dst_stride  distance in bytes between two rows of dst.
 * filter_x    horizontal taps, read as two 16-bit pairs at byte offsets 0, 2.
 * filter_y    vertical taps; a full 16-byte vector is loaded from this
 *             address, but only the first four int8 values are used.
 * height      number of rows.
 * width       number of columns.
 */
1992cabdff1aSopenharmony_cistatic av_always_inline
1993cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
1994cabdff1aSopenharmony_ci                                int16_t *src1_ptr, int32_t src2_stride,
1995cabdff1aSopenharmony_ci                                uint8_t *dst, int32_t dst_stride,
1996cabdff1aSopenharmony_ci                                const int8_t *filter_x, const int8_t *filter_y,
1997cabdff1aSopenharmony_ci                                int32_t height, int32_t width)
1998cabdff1aSopenharmony_ci{
1999cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
2000cabdff1aSopenharmony_ci    uint8_t *src0_ptr_tmp;
2001cabdff1aSopenharmony_ci    int16_t *src1_ptr_tmp;
2002cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
    /* Pre-computed multiples of the strides. The src2_* values are used both
     * as byte offsets for __lsx_vldx and as element counts for pointer
     * arithmetic on the int16_t pointer (see below). */
2003cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
2004cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
2005cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
2006cabdff1aSopenharmony_ci    const int32_t dst_stride_4x = (dst_stride << 2);
2007cabdff1aSopenharmony_ci    const int32_t src2_stride_x = (src2_stride << 1);
2008cabdff1aSopenharmony_ci    const int32_t src2_stride_2x = (src2_stride << 2);
2009cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
2010cabdff1aSopenharmony_ci    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
2011cabdff1aSopenharmony_ci    const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
2012cabdff1aSopenharmony_ci    __m128i out0, out1;
2013cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6;
2014cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
2015cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2016cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1;
2017cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2018cabdff1aSopenharmony_ci    __m128i mask1, filter_vec;
2019cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2020cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
2021cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2022cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3;
2023cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
2024cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
2025cabdff1aSopenharmony_ci    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
2026cabdff1aSopenharmony_ci
    /* Step back one row and one column so the 4-tap windows have the
     * samples they need before the first output position. */
2027cabdff1aSopenharmony_ci    src0_ptr -= (src_stride + 1);
2028cabdff1aSopenharmony_ci
    /* Horizontal taps: each vector repeats one 16-bit pair of int8 taps. */
2029cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2030cabdff1aSopenharmony_ci
    /* Vertical taps: widen int8 to int16, then repeat 32-bit element 0
     * (taps 0,1) and element 1 (taps 2,3) across the vector. */
2031cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
2032cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2033cabdff1aSopenharmony_ci
2034cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2035cabdff1aSopenharmony_ci
    /* mask0 selects the byte pairs (0,1),(1,2)...(7,8); mask1 selects the
     * pairs two bytes further to the right. */
2036cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
2037cabdff1aSopenharmony_ci
    /* Outer loop: one 8-column strip per iteration. */
2038cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
2039cabdff1aSopenharmony_ci        src0_ptr_tmp = src0_ptr;
2040cabdff1aSopenharmony_ci        dst_tmp = dst;
2041cabdff1aSopenharmony_ci        src1_ptr_tmp = src1_ptr;
2042cabdff1aSopenharmony_ci
        /* Prime the vertical filter with the first three rows of the strip. */
2043cabdff1aSopenharmony_ci        src0 = __lsx_vld(src0_ptr_tmp, 0);
2044cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
2045cabdff1aSopenharmony_ci                  src_stride_2x, src1, src2);
2046cabdff1aSopenharmony_ci        src0_ptr_tmp += src_stride_3x;
2047cabdff1aSopenharmony_ci
2048cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
2049cabdff1aSopenharmony_ci                  vec0, vec1);
2050cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
2051cabdff1aSopenharmony_ci                  vec2, vec3);
2052cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
2053cabdff1aSopenharmony_ci                  vec4, vec5);
2054cabdff1aSopenharmony_ci
        /* Horizontal pass for the three priming rows. */
2055cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2056cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2057cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2058cabdff1aSopenharmony_ci                  dst0, dst1);
2059cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2060cabdff1aSopenharmony_ci
        /* Interleave adjacent rows: _r is the low half, _l the high half. */
2061cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2062cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2063cabdff1aSopenharmony_ci
        /* Inner loop: four output rows per iteration. */
2064cabdff1aSopenharmony_ci        for (loop_cnt = height >> 2; loop_cnt--;) {
2065cabdff1aSopenharmony_ci            src3 = __lsx_vld(src0_ptr_tmp, 0);
2066cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
2067cabdff1aSopenharmony_ci                      src_stride_2x, src4, src5);
2068cabdff1aSopenharmony_ci            src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
2069cabdff1aSopenharmony_ci            src0_ptr_tmp += src_stride_4x;
2070cabdff1aSopenharmony_ci            in0 = __lsx_vld(src1_ptr_tmp, 0);
2071cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
2072cabdff1aSopenharmony_ci                      src2_stride_2x, in1, in2);
2073cabdff1aSopenharmony_ci            in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
            /* int16_t pointer arithmetic: (src2_stride << 2) elements is
             * four rows, although the variable is named "_2x". */
2074cabdff1aSopenharmony_ci            src1_ptr_tmp += src2_stride_2x;
2075cabdff1aSopenharmony_ci
2076cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
2077cabdff1aSopenharmony_ci                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
2078cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
2079cabdff1aSopenharmony_ci                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
2080cabdff1aSopenharmony_ci
            /* Horizontal pass for the four new rows. */
2081cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2082cabdff1aSopenharmony_ci                      vec6, filt0, dst3, dst4, dst5, dst6);
2083cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
2084cabdff1aSopenharmony_ci                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
2085cabdff1aSopenharmony_ci                      dst3, dst4, dst5, dst6);
2086cabdff1aSopenharmony_ci
2087cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2088cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2089cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
2090cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
2091cabdff1aSopenharmony_ci
            /* Vertical pass: output row k uses rows (k, k+1) with the first
             * tap pair and rows (k+2, k+3) with the second tap pair. */
2092cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2093cabdff1aSopenharmony_ci                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2094cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2095cabdff1aSopenharmony_ci                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2096cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
2097cabdff1aSopenharmony_ci                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
2098cabdff1aSopenharmony_ci                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
2099cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
2100cabdff1aSopenharmony_ci                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
2101cabdff1aSopenharmony_ci                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
2102cabdff1aSopenharmony_ci
            /* Scale down by 6 bits, keep the low 16 bits of each lane, add
             * the 16-bit source with saturation, clamp at zero, then
             * round-shift by 7 and narrow to unsigned bytes. */
2103cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2104cabdff1aSopenharmony_ci                      dst0_r, dst0_l, dst1_r, dst1_l);
2105cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2106cabdff1aSopenharmony_ci                      dst2_r, dst2_l, dst3_r, dst3_l);
2107cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
2108cabdff1aSopenharmony_ci                      dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
2109cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
2110cabdff1aSopenharmony_ci                      tmp0, tmp1, tmp2, tmp3);
2111cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
2112cabdff1aSopenharmony_ci                      tmp1, tmp2, tmp3);
2113cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
            /* Each outN packs two rows; store 8 bytes per row. */
2114cabdff1aSopenharmony_ci            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
2115cabdff1aSopenharmony_ci            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
2116cabdff1aSopenharmony_ci            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
2117cabdff1aSopenharmony_ci            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
2118cabdff1aSopenharmony_ci            dst_tmp += dst_stride_4x;
2119cabdff1aSopenharmony_ci
            /* Carry the last three filtered rows into the next iteration so
             * they play the role of the priming rows. */
2120cabdff1aSopenharmony_ci            dst10_r = dst54_r;
2121cabdff1aSopenharmony_ci            dst10_l = dst54_l;
2122cabdff1aSopenharmony_ci            dst21_r = dst65_r;
2123cabdff1aSopenharmony_ci            dst21_l = dst65_l;
2124cabdff1aSopenharmony_ci            dst2 = dst6;
2125cabdff1aSopenharmony_ci        }
2126cabdff1aSopenharmony_ci
        /* Advance all three pointers to the next 8-column strip. */
2127cabdff1aSopenharmony_ci        src0_ptr += 8;
2128cabdff1aSopenharmony_ci        dst += 8;
2129cabdff1aSopenharmony_ci        src1_ptr += 8;
2130cabdff1aSopenharmony_ci    }
2131cabdff1aSopenharmony_ci}
2132cabdff1aSopenharmony_ci
/*
 * Entry point of the 4-tap horizontal + vertical path for blocks that are
 * 8 columns wide. The block height selects the kernel: heights 2, 4 and 6
 * have dedicated routines, every other height goes through the generic
 * 8-column / 4-row loop.
 */
static void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                              int16_t *src1_ptr, int32_t src2_stride,
                              uint8_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    switch (height) {
    case 2:
        hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                           dst, dst_stride, filter_x, filter_y);
        break;
    case 4:
        /* A single 8-column strip of the fixed 4-row kernel. */
        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                               dst, dst_stride, filter_x, filter_y, 1);
        break;
    case 6:
        hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                           dst, dst_stride, filter_x, filter_y);
        break;
    default:
        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
        break;
    }
}
2153cabdff1aSopenharmony_ci
/*
 * Entry point of the 4-tap horizontal + vertical path for blocks that are
 * 16 columns wide. Height 4 runs the fixed 4-row kernel over two 8-column
 * strips; every other height goes through the generic loop.
 */
static void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
                               int16_t *src1_ptr, int32_t src2_stride,
                               uint8_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    if (height != 4) {
        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr,
                                   src2_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
        return;
    }

    hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
                           dst, dst_stride, filter_x, filter_y, 2);
}
2168cabdff1aSopenharmony_ci
/*
 * Entry point of the 4-tap horizontal + vertical path for blocks that are
 * 24 columns wide: forwards all arguments to the generic kernel with a
 * fixed width of 24.
 */
2169cabdff1aSopenharmony_cistatic void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
2170cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
2171cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
2172cabdff1aSopenharmony_ci                               const int8_t *filter_x, const int8_t *filter_y,
2173cabdff1aSopenharmony_ci                               int32_t height)
2174cabdff1aSopenharmony_ci{
2175cabdff1aSopenharmony_ci    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2176cabdff1aSopenharmony_ci                            dst, dst_stride, filter_x, filter_y, height, 24);
2177cabdff1aSopenharmony_ci}
2178cabdff1aSopenharmony_ci
/*
 * Entry point of the 4-tap horizontal + vertical path for blocks that are
 * 32 columns wide: forwards all arguments to the generic kernel with a
 * fixed width of 32.
 */
2179cabdff1aSopenharmony_cistatic void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
2180cabdff1aSopenharmony_ci                               int16_t *src1_ptr, int32_t src2_stride,
2181cabdff1aSopenharmony_ci                               uint8_t *dst, int32_t dst_stride,
2182cabdff1aSopenharmony_ci                               const int8_t *filter_x, const int8_t *filter_y,
2183cabdff1aSopenharmony_ci                               int32_t height)
2184cabdff1aSopenharmony_ci{
2185cabdff1aSopenharmony_ci    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
2186cabdff1aSopenharmony_ci                            dst, dst_stride, filter_x, filter_y, height, 32);
2187cabdff1aSopenharmony_ci}
2188cabdff1aSopenharmony_ci
/*
 * BI_MC_COPY(WIDTH) defines the exported function
 * ff_hevc_put_hevc_bi_pel_pixels<WIDTH>_8_lsx(). The generated function
 * forwards to hevc_bi_copy_<WIDTH>w_lsx(), passing MAX_PB_SIZE as the stride
 * of the 16-bit source. The mx, my and width arguments are part of the
 * signature but are not used by the generated body.
 */
2189cabdff1aSopenharmony_ci#define BI_MC_COPY(WIDTH)                                                 \
2190cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst,          \
2191cabdff1aSopenharmony_ci                                                   ptrdiff_t dst_stride,  \
2192cabdff1aSopenharmony_ci                                                   uint8_t *src,          \
2193cabdff1aSopenharmony_ci                                                   ptrdiff_t src_stride,  \
2194cabdff1aSopenharmony_ci                                                   int16_t *src_16bit,    \
2195cabdff1aSopenharmony_ci                                                   int height,            \
2196cabdff1aSopenharmony_ci                                                   intptr_t mx,           \
2197cabdff1aSopenharmony_ci                                                   intptr_t my,           \
2198cabdff1aSopenharmony_ci                                                   int width)             \
2199cabdff1aSopenharmony_ci{                                                                         \
2200cabdff1aSopenharmony_ci    hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE,  \
2201cabdff1aSopenharmony_ci                                dst, dst_stride, height);                 \
2202cabdff1aSopenharmony_ci}
2203cabdff1aSopenharmony_ci
/*
 * One wrapper per block width.
 * NOTE(review): each expansion is already a complete function definition,
 * so the trailing ';' is an empty file-scope declaration. Compilers accept
 * it as an extension, but it is diagnosed in pedantic mode.
 */
2204cabdff1aSopenharmony_ciBI_MC_COPY(4);
2205cabdff1aSopenharmony_ciBI_MC_COPY(6);
2206cabdff1aSopenharmony_ciBI_MC_COPY(8);
2207cabdff1aSopenharmony_ciBI_MC_COPY(12);
2208cabdff1aSopenharmony_ciBI_MC_COPY(16);
2209cabdff1aSopenharmony_ciBI_MC_COPY(24);
2210cabdff1aSopenharmony_ciBI_MC_COPY(32);
2211cabdff1aSopenharmony_ciBI_MC_COPY(48);
2212cabdff1aSopenharmony_ciBI_MC_COPY(64);
2213cabdff1aSopenharmony_ci
/* The helper macro is only needed for the instantiations above. */
2214cabdff1aSopenharmony_ci#undef BI_MC_COPY
2215cabdff1aSopenharmony_ci
2216cabdff1aSopenharmony_ci#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
2217cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
2218cabdff1aSopenharmony_ci                                                      ptrdiff_t dst_stride,  \
2219cabdff1aSopenharmony_ci                                                      uint8_t *src,          \
2220cabdff1aSopenharmony_ci                                                      ptrdiff_t src_stride,  \
2221cabdff1aSopenharmony_ci                                                      int16_t *src_16bit,    \
2222cabdff1aSopenharmony_ci                                                      int height,            \
2223cabdff1aSopenharmony_ci                                                      intptr_t mx,           \
2224cabdff1aSopenharmony_ci                                                      intptr_t my,           \
2225cabdff1aSopenharmony_ci                                                      int width)             \
2226cabdff1aSopenharmony_ci{                                                                            \
2227cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
2228cabdff1aSopenharmony_ci                                                                             \
2229cabdff1aSopenharmony_ci    hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,        \
2230cabdff1aSopenharmony_ci                                          MAX_PB_SIZE, dst, dst_stride,      \
2231cabdff1aSopenharmony_ci                                          filter, height);                   \
2232cabdff1aSopenharmony_ci}
2233cabdff1aSopenharmony_ci
2234cabdff1aSopenharmony_ciBI_MC(qpel, h, 16, 8, hz, mx);
2235cabdff1aSopenharmony_ciBI_MC(qpel, h, 24, 8, hz, mx);
2236cabdff1aSopenharmony_ciBI_MC(qpel, h, 32, 8, hz, mx);
2237cabdff1aSopenharmony_ciBI_MC(qpel, h, 48, 8, hz, mx);
2238cabdff1aSopenharmony_ciBI_MC(qpel, h, 64, 8, hz, mx);
2239cabdff1aSopenharmony_ci
2240cabdff1aSopenharmony_ciBI_MC(qpel, v, 8, 8, vt, my);
2241cabdff1aSopenharmony_ciBI_MC(qpel, v, 16, 8, vt, my);
2242cabdff1aSopenharmony_ciBI_MC(qpel, v, 24, 8, vt, my);
2243cabdff1aSopenharmony_ciBI_MC(qpel, v, 32, 8, vt, my);
2244cabdff1aSopenharmony_ciBI_MC(qpel, v, 48, 8, vt, my);
2245cabdff1aSopenharmony_ciBI_MC(qpel, v, 64, 8, vt, my);
2246cabdff1aSopenharmony_ci
2247cabdff1aSopenharmony_ciBI_MC(epel, h, 24, 4, hz, mx);
2248cabdff1aSopenharmony_ciBI_MC(epel, h, 32, 4, hz, mx);
2249cabdff1aSopenharmony_ci
2250cabdff1aSopenharmony_ciBI_MC(epel, v, 12, 4, vt, my);
2251cabdff1aSopenharmony_ciBI_MC(epel, v, 16, 4, vt, my);
2252cabdff1aSopenharmony_ciBI_MC(epel, v, 24, 4, vt, my);
2253cabdff1aSopenharmony_ciBI_MC(epel, v, 32, 4, vt, my);
2254cabdff1aSopenharmony_ci
2255cabdff1aSopenharmony_ci#undef BI_MC
2256cabdff1aSopenharmony_ci
2257cabdff1aSopenharmony_ci#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
2258cabdff1aSopenharmony_civoid ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,          \
2259cabdff1aSopenharmony_ci                                                   ptrdiff_t dst_stride,  \
2260cabdff1aSopenharmony_ci                                                   uint8_t *src,          \
2261cabdff1aSopenharmony_ci                                                   ptrdiff_t src_stride,  \
2262cabdff1aSopenharmony_ci                                                   int16_t *src_16bit,    \
2263cabdff1aSopenharmony_ci                                                   int height,            \
2264cabdff1aSopenharmony_ci                                                   intptr_t mx,           \
2265cabdff1aSopenharmony_ci                                                   intptr_t my,           \
2266cabdff1aSopenharmony_ci                                                   int width)             \
2267cabdff1aSopenharmony_ci{                                                                         \
2268cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
2269cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
2270cabdff1aSopenharmony_ci                                                                          \
2271cabdff1aSopenharmony_ci    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,           \
2272cabdff1aSopenharmony_ci                                    MAX_PB_SIZE, dst, dst_stride,         \
2273cabdff1aSopenharmony_ci                                    filter_x, filter_y, height);          \
2274cabdff1aSopenharmony_ci}
2275cabdff1aSopenharmony_ci
2276cabdff1aSopenharmony_ciBI_MC_HV(qpel, 8, 8);
2277cabdff1aSopenharmony_ciBI_MC_HV(qpel, 16, 8);
2278cabdff1aSopenharmony_ciBI_MC_HV(qpel, 24, 8);
2279cabdff1aSopenharmony_ciBI_MC_HV(qpel, 32, 8);
2280cabdff1aSopenharmony_ciBI_MC_HV(qpel, 48, 8);
2281cabdff1aSopenharmony_ciBI_MC_HV(qpel, 64, 8);
2282cabdff1aSopenharmony_ci
2283cabdff1aSopenharmony_ciBI_MC_HV(epel, 8, 4);
2284cabdff1aSopenharmony_ciBI_MC_HV(epel, 6, 4);
2285cabdff1aSopenharmony_ciBI_MC_HV(epel, 16, 4);
2286cabdff1aSopenharmony_ciBI_MC_HV(epel, 24, 4);
2287cabdff1aSopenharmony_ciBI_MC_HV(epel, 32, 4);
2288cabdff1aSopenharmony_ci
2289cabdff1aSopenharmony_ci#undef BI_MC_HV
2290