/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
24cabdff1aSopenharmony_ci#include "hevcdsp_lsx.h"
25cabdff1aSopenharmony_ci
/*
 * Byte-index table holding two 16-entry rows, aligned to 64 bytes.
 * Row 0 is labelled for 8-wide blocks, row 1 for 4-wide blocks.
 * NOTE(review): presumably loaded as byte-shuffle control vectors by the
 * filter routines later in this file (indices >= 16 in row 1 would then pick
 * bytes from a second source vector) -- confirm at the use sites.
 */
static const uint8_t ff_hevc_mask_arr[2 * 16] __attribute__((aligned(64))) = {
    /* row 0: 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* row 1: 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20
};
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ci/* hevc_copy: dst = src << 6 */
34cabdff1aSopenharmony_cistatic void hevc_copy_4w_lsx(uint8_t *src, int32_t src_stride,
35cabdff1aSopenharmony_ci                             int16_t *dst, int32_t dst_stride,
36cabdff1aSopenharmony_ci                             int32_t height)
37cabdff1aSopenharmony_ci{
38cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
39cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
40cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
41cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
42cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
43cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
44cabdff1aSopenharmony_ci    int32_t loop_cnt = height >> 3;
45cabdff1aSopenharmony_ci    int32_t res = height & 0x07;
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
48cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3;
49cabdff1aSopenharmony_ci    for (; loop_cnt--;) {
50cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
51cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
52cabdff1aSopenharmony_ci                  src1, src2);
53cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
54cabdff1aSopenharmony_ci        src += src_stride_4x;
55cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
56cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
57cabdff1aSopenharmony_ci                  src5, src6);
58cabdff1aSopenharmony_ci        src7 = __lsx_vldx(src, src_stride_3x);
59cabdff1aSopenharmony_ci        src += src_stride_4x;
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src5, src4, src7, src6,
62cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
63cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
64cabdff1aSopenharmony_ci                  in0, in1, in2, in3);
65cabdff1aSopenharmony_ci
66cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 0, 0);
67cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst + dst_stride, 0, 1);
68cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst + dst_stride_2x, 0, 0);
69cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst + dst_stride_3x, 0, 1);
70cabdff1aSopenharmony_ci        dst += dst_stride_4x;
71cabdff1aSopenharmony_ci        __lsx_vstelm_d(in2, dst, 0, 0);
72cabdff1aSopenharmony_ci        __lsx_vstelm_d(in2, dst + dst_stride, 0, 1);
73cabdff1aSopenharmony_ci        __lsx_vstelm_d(in3, dst + dst_stride_2x, 0, 0);
74cabdff1aSopenharmony_ci        __lsx_vstelm_d(in3, dst + dst_stride_3x, 0, 1);
75cabdff1aSopenharmony_ci        dst += dst_stride_4x;
76cabdff1aSopenharmony_ci    }
77cabdff1aSopenharmony_ci    for (;res--;) {
78cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
79cabdff1aSopenharmony_ci        in0 = __lsx_vsllwil_hu_bu(src0, 6);
80cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 0, 0);
81cabdff1aSopenharmony_ci        src += src_stride;
82cabdff1aSopenharmony_ci        dst += dst_stride;
83cabdff1aSopenharmony_ci    }
84cabdff1aSopenharmony_ci}
85cabdff1aSopenharmony_ci
86cabdff1aSopenharmony_cistatic void hevc_copy_6w_lsx(uint8_t *src, int32_t src_stride,
87cabdff1aSopenharmony_ci                             int16_t *dst, int32_t dst_stride,
88cabdff1aSopenharmony_ci                             int32_t height)
89cabdff1aSopenharmony_ci{
90cabdff1aSopenharmony_ci    int32_t loop_cnt = (height >> 3);
91cabdff1aSopenharmony_ci    int32_t res = height & 0x07;
92cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
93cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
94cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
95cabdff1aSopenharmony_ci
96cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
97cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
98cabdff1aSopenharmony_ci
99cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
100cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
101cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
102cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
103cabdff1aSopenharmony_ci        src += src_stride_4x;
104cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
105cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
106cabdff1aSopenharmony_ci        src7 = __lsx_vldx(src, src_stride_3x);
107cabdff1aSopenharmony_ci        src += src_stride_4x;
108cabdff1aSopenharmony_ci
109cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
110cabdff1aSopenharmony_ci                  in0, in1, in2, in3);
111cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
112cabdff1aSopenharmony_ci                  in4, in5, in6, in7);
113cabdff1aSopenharmony_ci
114cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 0, 0);
115cabdff1aSopenharmony_ci        __lsx_vstelm_w(in0, dst, 8, 2);
116cabdff1aSopenharmony_ci        dst += dst_stride;
117cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst, 0, 0);
118cabdff1aSopenharmony_ci        __lsx_vstelm_w(in1, dst, 8, 2);
119cabdff1aSopenharmony_ci        dst += dst_stride;
120cabdff1aSopenharmony_ci        __lsx_vstelm_d(in2, dst, 0, 0);
121cabdff1aSopenharmony_ci        __lsx_vstelm_w(in2, dst, 8, 2);
122cabdff1aSopenharmony_ci        dst += dst_stride;
123cabdff1aSopenharmony_ci        __lsx_vstelm_d(in3, dst, 0, 0);
124cabdff1aSopenharmony_ci        __lsx_vstelm_w(in3, dst, 8, 2);
125cabdff1aSopenharmony_ci        dst += dst_stride;
126cabdff1aSopenharmony_ci        __lsx_vstelm_d(in4, dst, 0, 0);
127cabdff1aSopenharmony_ci        __lsx_vstelm_w(in4, dst, 8, 2);
128cabdff1aSopenharmony_ci        dst += dst_stride;
129cabdff1aSopenharmony_ci        __lsx_vstelm_d(in5, dst, 0, 0);
130cabdff1aSopenharmony_ci        __lsx_vstelm_w(in5, dst, 8, 2);
131cabdff1aSopenharmony_ci        dst += dst_stride;
132cabdff1aSopenharmony_ci        __lsx_vstelm_d(in6, dst, 0, 0);
133cabdff1aSopenharmony_ci        __lsx_vstelm_w(in6, dst, 8, 2);
134cabdff1aSopenharmony_ci        dst += dst_stride;
135cabdff1aSopenharmony_ci        __lsx_vstelm_d(in7, dst, 0, 0);
136cabdff1aSopenharmony_ci        __lsx_vstelm_w(in7, dst, 8, 2);
137cabdff1aSopenharmony_ci        dst += dst_stride;
138cabdff1aSopenharmony_ci    }
139cabdff1aSopenharmony_ci    for (;res--;) {
140cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
141cabdff1aSopenharmony_ci        in0 = __lsx_vsllwil_hu_bu(src0, 6);
142cabdff1aSopenharmony_ci        src += src_stride;
143cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 0, 0);
144cabdff1aSopenharmony_ci        __lsx_vstelm_w(in0, dst, 8, 2);
145cabdff1aSopenharmony_ci        dst += dst_stride;
146cabdff1aSopenharmony_ci    }
147cabdff1aSopenharmony_ci}
148cabdff1aSopenharmony_ci
149cabdff1aSopenharmony_cistatic void hevc_copy_8w_lsx(uint8_t *src, int32_t src_stride,
150cabdff1aSopenharmony_ci                             int16_t *dst, int32_t dst_stride,
151cabdff1aSopenharmony_ci                             int32_t height)
152cabdff1aSopenharmony_ci{
153cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
154cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
155cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
156cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride_x << 1);
157cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
158cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
159cabdff1aSopenharmony_ci    int32_t loop_cnt = height >> 3;
160cabdff1aSopenharmony_ci    int32_t res = height & 0x07;
161cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
162cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
163cabdff1aSopenharmony_ci
164cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
165cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
166cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
167cabdff1aSopenharmony_ci                  src1, src2);
168cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
169cabdff1aSopenharmony_ci        src += src_stride_4x;
170cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
171cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
172cabdff1aSopenharmony_ci                  src5, src6);
173cabdff1aSopenharmony_ci        src7 = __lsx_vldx(src, src_stride_3x);
174cabdff1aSopenharmony_ci        src += src_stride_4x;
175cabdff1aSopenharmony_ci
176cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
177cabdff1aSopenharmony_ci                  in0, in1, in2, in3);
178cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
179cabdff1aSopenharmony_ci                  in4, in5, in6, in7);
180cabdff1aSopenharmony_ci        __lsx_vst(in0, dst, 0);
181cabdff1aSopenharmony_ci        __lsx_vstx(in1, dst, dst_stride_x);
182cabdff1aSopenharmony_ci        __lsx_vstx(in2, dst, dst_stride_2x);
183cabdff1aSopenharmony_ci        __lsx_vstx(in3, dst, dst_stride_3x);
184cabdff1aSopenharmony_ci        dst += dst_stride_2x;
185cabdff1aSopenharmony_ci        __lsx_vst(in4, dst, 0);
186cabdff1aSopenharmony_ci        __lsx_vstx(in5, dst, dst_stride_x);
187cabdff1aSopenharmony_ci        __lsx_vstx(in6, dst, dst_stride_2x);
188cabdff1aSopenharmony_ci        __lsx_vstx(in7, dst, dst_stride_3x);
189cabdff1aSopenharmony_ci        dst += dst_stride_2x;
190cabdff1aSopenharmony_ci    }
191cabdff1aSopenharmony_ci    for (;res--;) {
192cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
193cabdff1aSopenharmony_ci        in0 = __lsx_vsllwil_hu_bu(src0, 6);
194cabdff1aSopenharmony_ci        __lsx_vst(in0, dst, 0);
195cabdff1aSopenharmony_ci        src += src_stride;
196cabdff1aSopenharmony_ci        dst += dst_stride;
197cabdff1aSopenharmony_ci    }
198cabdff1aSopenharmony_ci}
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_cistatic void hevc_copy_12w_lsx(uint8_t *src, int32_t src_stride,
201cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
202cabdff1aSopenharmony_ci                              int32_t height)
203cabdff1aSopenharmony_ci{
204cabdff1aSopenharmony_ci    uint32_t loop_cnt;
205cabdff1aSopenharmony_ci    uint32_t res = height & 0x07;
206cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
207cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
208cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
209cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride_x << 1);
210cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
211cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
212cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
213cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
214cabdff1aSopenharmony_ci    __m128i in0, in1, in0_r, in1_r, in2_r, in3_r;
215cabdff1aSopenharmony_ci
216cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
217cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
218cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
219cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
220cabdff1aSopenharmony_ci        src += src_stride_4x;
221cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
222cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
223cabdff1aSopenharmony_ci        src7 = __lsx_vldx(src, src_stride_3x);
224cabdff1aSopenharmony_ci        src += src_stride_4x;
225cabdff1aSopenharmony_ci
226cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
227cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
228cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
229cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
230cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
231cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst, dst_stride_x);
232cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst, dst_stride_2x);
233cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst, dst_stride_3x);
234cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 16, 0);
235cabdff1aSopenharmony_ci        dst += dst_stride;
236cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 16, 1);
237cabdff1aSopenharmony_ci        dst += dst_stride;
238cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst, 16, 0);
239cabdff1aSopenharmony_ci        dst += dst_stride;
240cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst, 16, 1);
241cabdff1aSopenharmony_ci        dst += dst_stride;
242cabdff1aSopenharmony_ci
243cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
244cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
245cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_w, src5, src4, src7, src6, src0, src1);
246cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
247cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
248cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst, dst_stride_x);
249cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst, dst_stride_2x);
250cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst, dst_stride_3x);
251cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 16, 0);
252cabdff1aSopenharmony_ci        dst += dst_stride;
253cabdff1aSopenharmony_ci        __lsx_vstelm_d(in0, dst, 16, 1);
254cabdff1aSopenharmony_ci        dst += dst_stride;
255cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst, 16, 0);
256cabdff1aSopenharmony_ci        dst += dst_stride;
257cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst, 16, 1);
258cabdff1aSopenharmony_ci        dst += dst_stride;
259cabdff1aSopenharmony_ci    }
260cabdff1aSopenharmony_ci    for (;res--;) {
261cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
262cabdff1aSopenharmony_ci        in0  = __lsx_vsllwil_hu_bu(src0, 6);
263cabdff1aSopenharmony_ci        src1 = __lsx_vilvh_b(zero, src0);
264cabdff1aSopenharmony_ci        in1  = __lsx_vslli_h(src1, 6);
265cabdff1aSopenharmony_ci        __lsx_vst(in0, dst, 0);
266cabdff1aSopenharmony_ci        __lsx_vstelm_d(in1, dst, 16, 0);
267cabdff1aSopenharmony_ci        src += src_stride;
268cabdff1aSopenharmony_ci        dst += dst_stride;
269cabdff1aSopenharmony_ci    }
270cabdff1aSopenharmony_ci}
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_cistatic void hevc_copy_16w_lsx(uint8_t *src, int32_t src_stride,
273cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
274cabdff1aSopenharmony_ci                              int32_t height)
275cabdff1aSopenharmony_ci{
276cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
277cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
278cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
279cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
280cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
281cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
282cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
283cabdff1aSopenharmony_ci    int32_t loop_cnt = height >> 3;
284cabdff1aSopenharmony_ci    int32_t res = height & 0x07;
285cabdff1aSopenharmony_ci    int16_t* dst1 = dst + 8;
286cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
287cabdff1aSopenharmony_ci    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
288cabdff1aSopenharmony_ci
289cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
290cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
291cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
292cabdff1aSopenharmony_ci                  src1, src2);
293cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
294cabdff1aSopenharmony_ci        src += src_stride_4x;
295cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
296cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
297cabdff1aSopenharmony_ci                  src5, src6);
298cabdff1aSopenharmony_ci        src7 = __lsx_vldx(src, src_stride_3x);
299cabdff1aSopenharmony_ci        src += src_stride_4x;
300cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
301cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
302cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
303cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
304cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
305cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
306cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
307cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst, dst_stride_x);
308cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst, dst_stride_2x);
309cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst, dst_stride_3x);
310cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst1, 0);
311cabdff1aSopenharmony_ci        __lsx_vstx(in1_l, dst1, dst_stride_x);
312cabdff1aSopenharmony_ci        __lsx_vstx(in2_l, dst1, dst_stride_2x);
313cabdff1aSopenharmony_ci        __lsx_vstx(in3_l, dst1, dst_stride_3x);
314cabdff1aSopenharmony_ci        dst += dst_stride_2x;
315cabdff1aSopenharmony_ci        dst1 += dst_stride_2x;
316cabdff1aSopenharmony_ci
317cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
318cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
319cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
320cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
321cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
322cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
323cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
324cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst, dst_stride_x);
325cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst, dst_stride_2x);
326cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst, dst_stride_3x);
327cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst1, 0);
328cabdff1aSopenharmony_ci        __lsx_vstx(in1_l, dst1, dst_stride_x);
329cabdff1aSopenharmony_ci        __lsx_vstx(in2_l, dst1, dst_stride_2x);
330cabdff1aSopenharmony_ci        __lsx_vstx(in3_l, dst1, dst_stride_3x);
331cabdff1aSopenharmony_ci        dst += dst_stride_2x;
332cabdff1aSopenharmony_ci        dst1 += dst_stride_2x;
333cabdff1aSopenharmony_ci    }
334cabdff1aSopenharmony_ci    if (res) {
335cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
336cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
337cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
338cabdff1aSopenharmony_ci
339cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
340cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
341cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
342cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
343cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
344cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
345cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
346cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst, dst_stride_x);
347cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst, dst_stride_2x);
348cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst, dst_stride_3x);
349cabdff1aSopenharmony_ci        dst += 8;
350cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 0);
351cabdff1aSopenharmony_ci        __lsx_vstx(in1_l, dst, dst_stride_x);
352cabdff1aSopenharmony_ci        __lsx_vstx(in2_l, dst, dst_stride_2x);
353cabdff1aSopenharmony_ci        __lsx_vstx(in3_l, dst, dst_stride_3x);
354cabdff1aSopenharmony_ci    }
355cabdff1aSopenharmony_ci}
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_cistatic void hevc_copy_24w_lsx(uint8_t *src, int32_t src_stride,
358cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
359cabdff1aSopenharmony_ci                              int32_t height)
360cabdff1aSopenharmony_ci{
361cabdff1aSopenharmony_ci    uint32_t loop_cnt;
362cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
363cabdff1aSopenharmony_ci    int32_t dst_stride_x = (dst_stride << 1);
364cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
365cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
366cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
367cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
368cabdff1aSopenharmony_ci    uint8_t *_src = src + 16;
369cabdff1aSopenharmony_ci    int16_t *dst1 = dst;
370cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
371cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
372cabdff1aSopenharmony_ci    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
373cabdff1aSopenharmony_ci
374cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
375cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
376cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
377cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
378cabdff1aSopenharmony_ci        src += src_stride_4x;
379cabdff1aSopenharmony_ci        src4 = __lsx_vld(_src, 0);
380cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
381cabdff1aSopenharmony_ci                  src5, src6);
382cabdff1aSopenharmony_ci        src7 = __lsx_vldx(_src, src_stride_3x);
383cabdff1aSopenharmony_ci        _src += src_stride_4x;
384cabdff1aSopenharmony_ci
385cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
386cabdff1aSopenharmony_ci                  src3, in0_l, in1_l, in2_l, in3_l);
387cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
388cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
389cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
390cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
391cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
392cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst, dst_stride_x);
393cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst, dst_stride_2x);
394cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst, dst_stride_3x);
395cabdff1aSopenharmony_ci        dst1 = dst + 8;
396cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst1, 0);
397cabdff1aSopenharmony_ci        __lsx_vstx(in1_l, dst1, dst_stride_x);
398cabdff1aSopenharmony_ci        __lsx_vstx(in2_l, dst1, dst_stride_2x);
399cabdff1aSopenharmony_ci        __lsx_vstx(in3_l, dst1, dst_stride_3x);
400cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
401cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
402cabdff1aSopenharmony_ci        dst1 = dst1 + 8;
403cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst1, 0);
404cabdff1aSopenharmony_ci        __lsx_vstx(in1_r, dst1, dst_stride_x);
405cabdff1aSopenharmony_ci        __lsx_vstx(in2_r, dst1, dst_stride_2x);
406cabdff1aSopenharmony_ci        __lsx_vstx(in3_r, dst1, dst_stride_3x);
407cabdff1aSopenharmony_ci        dst += dst_stride_2x;
408cabdff1aSopenharmony_ci    }
409cabdff1aSopenharmony_ci}
410cabdff1aSopenharmony_ci
411cabdff1aSopenharmony_cistatic void hevc_copy_32w_lsx(uint8_t *src, int32_t src_stride,
412cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
413cabdff1aSopenharmony_ci                              int32_t height)
414cabdff1aSopenharmony_ci{
415cabdff1aSopenharmony_ci    uint32_t loop_cnt;
416cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
417cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
418cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
419cabdff1aSopenharmony_ci    uint8_t *_src = src + 16;
420cabdff1aSopenharmony_ci    __m128i zero = {0};
421cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
422cabdff1aSopenharmony_ci    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
425cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
426cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src2, src4);
427cabdff1aSopenharmony_ci        src6 = __lsx_vldx(src, src_stride_3x);
428cabdff1aSopenharmony_ci        src += src_stride_4x;
429cabdff1aSopenharmony_ci        src1 = __lsx_vld(_src, 0);
430cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
431cabdff1aSopenharmony_ci                  src3, src5);
432cabdff1aSopenharmony_ci        src7 = __lsx_vldx(_src, src_stride_3x);
433cabdff1aSopenharmony_ci        _src += src_stride_4x;
434cabdff1aSopenharmony_ci
435cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
436cabdff1aSopenharmony_ci                  src3, in0_l, in1_l, in2_l, in3_l);
437cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
438cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
439cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
440cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
441cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
442cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 16);
443cabdff1aSopenharmony_ci        __lsx_vst(in1_r, dst, 32);
444cabdff1aSopenharmony_ci        __lsx_vst(in1_l, dst, 48);
445cabdff1aSopenharmony_ci        dst += dst_stride;
446cabdff1aSopenharmony_ci        __lsx_vst(in2_r, dst, 0);
447cabdff1aSopenharmony_ci        __lsx_vst(in2_l, dst, 16);
448cabdff1aSopenharmony_ci        __lsx_vst(in3_r, dst, 32);
449cabdff1aSopenharmony_ci        __lsx_vst(in3_l, dst, 48);
450cabdff1aSopenharmony_ci        dst += dst_stride;
451cabdff1aSopenharmony_ci
452cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
453cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
454cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
455cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
456cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
457cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
458cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
459cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 16);
460cabdff1aSopenharmony_ci        __lsx_vst(in1_r, dst, 32);
461cabdff1aSopenharmony_ci        __lsx_vst(in1_l, dst, 48);
462cabdff1aSopenharmony_ci        dst += dst_stride;
463cabdff1aSopenharmony_ci        __lsx_vst(in2_r, dst, 0);
464cabdff1aSopenharmony_ci        __lsx_vst(in2_l, dst, 16);
465cabdff1aSopenharmony_ci        __lsx_vst(in3_r, dst, 32);
466cabdff1aSopenharmony_ci        __lsx_vst(in3_l, dst, 48);
467cabdff1aSopenharmony_ci        dst += dst_stride;
468cabdff1aSopenharmony_ci    }
469cabdff1aSopenharmony_ci}
470cabdff1aSopenharmony_ci
471cabdff1aSopenharmony_cistatic void hevc_copy_48w_lsx(uint8_t *src, int32_t src_stride,
472cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
473cabdff1aSopenharmony_ci                              int32_t height)
474cabdff1aSopenharmony_ci{
475cabdff1aSopenharmony_ci    uint32_t loop_cnt;
476cabdff1aSopenharmony_ci    __m128i zero = {0};
477cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
478cabdff1aSopenharmony_ci    __m128i src8, src9, src10, src11;
479cabdff1aSopenharmony_ci    __m128i in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
480cabdff1aSopenharmony_ci    __m128i in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
481cabdff1aSopenharmony_ci
482cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
483cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
484cabdff1aSopenharmony_ci        src2 = __lsx_vld(src, 32);
485cabdff1aSopenharmony_ci        src += src_stride;
486cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src3, src4);
487cabdff1aSopenharmony_ci        src5 = __lsx_vld(src, 32);
488cabdff1aSopenharmony_ci        src += src_stride;
489cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src6, src7);
490cabdff1aSopenharmony_ci        src8 = __lsx_vld(src, 32);
491cabdff1aSopenharmony_ci        src += src_stride;
492cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src9, src10);
493cabdff1aSopenharmony_ci        src11 = __lsx_vld(src, 32);
494cabdff1aSopenharmony_ci        src += src_stride;
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
497cabdff1aSopenharmony_ci                  src3, in0_l, in1_l, in2_l, in3_l);
498cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, in4_l, in5_l);
499cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
500cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
501cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
502cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
503cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, in4_r, in5_r);
504cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
505cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
506cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 16);
507cabdff1aSopenharmony_ci        __lsx_vst(in1_r, dst, 32);
508cabdff1aSopenharmony_ci        __lsx_vst(in1_l, dst, 48);
509cabdff1aSopenharmony_ci        __lsx_vst(in2_r, dst, 64);
510cabdff1aSopenharmony_ci        __lsx_vst(in2_l, dst, 80);
511cabdff1aSopenharmony_ci        dst += dst_stride;
512cabdff1aSopenharmony_ci        __lsx_vst(in3_r, dst, 0);
513cabdff1aSopenharmony_ci        __lsx_vst(in3_l, dst, 16);
514cabdff1aSopenharmony_ci        __lsx_vst(in4_r, dst, 32);
515cabdff1aSopenharmony_ci        __lsx_vst(in4_l, dst, 48);
516cabdff1aSopenharmony_ci        __lsx_vst(in5_r, dst, 64);
517cabdff1aSopenharmony_ci        __lsx_vst(in5_l, dst, 80);
518cabdff1aSopenharmony_ci        dst += dst_stride;
519cabdff1aSopenharmony_ci
520cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src6, zero, src7, zero, src8, zero, src9,
521cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
522cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, zero, src10, zero, src11, in4_l, in5_l);
523cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src6, 6, src7, 6, src8, 6, src9, 6,
524cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
525cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
526cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
527cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vsllwil_hu_bu, src10, 6, src11, 6, in4_r, in5_r);
528cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
529cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
530cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 16);
531cabdff1aSopenharmony_ci        __lsx_vst(in1_r, dst, 32);
532cabdff1aSopenharmony_ci        __lsx_vst(in1_l, dst, 48);
533cabdff1aSopenharmony_ci        __lsx_vst(in2_r, dst, 64);
534cabdff1aSopenharmony_ci        __lsx_vst(in2_l, dst, 80);
535cabdff1aSopenharmony_ci        dst += dst_stride;
536cabdff1aSopenharmony_ci        __lsx_vst(in3_r, dst, 0);
537cabdff1aSopenharmony_ci        __lsx_vst(in3_l, dst, 16);
538cabdff1aSopenharmony_ci        __lsx_vst(in4_r, dst, 32);
539cabdff1aSopenharmony_ci        __lsx_vst(in4_l, dst, 48);
540cabdff1aSopenharmony_ci        __lsx_vst(in5_r, dst, 64);
541cabdff1aSopenharmony_ci        __lsx_vst(in5_l, dst, 80);
542cabdff1aSopenharmony_ci        dst += dst_stride;
543cabdff1aSopenharmony_ci    }
544cabdff1aSopenharmony_ci}
545cabdff1aSopenharmony_ci
546cabdff1aSopenharmony_cistatic void hevc_copy_64w_lsx(uint8_t *src, int32_t src_stride,
547cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
548cabdff1aSopenharmony_ci                              int32_t height)
549cabdff1aSopenharmony_ci{
550cabdff1aSopenharmony_ci    uint32_t loop_cnt;
551cabdff1aSopenharmony_ci    __m128i zero = {0};
552cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
553cabdff1aSopenharmony_ci    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
554cabdff1aSopenharmony_ci
555cabdff1aSopenharmony_ci
556cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
557cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
558cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
559cabdff1aSopenharmony_ci        src += src_stride;
560cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
561cabdff1aSopenharmony_ci                  src4, src5, src6, src7);
562cabdff1aSopenharmony_ci        src += src_stride;
563cabdff1aSopenharmony_ci
564cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
565cabdff1aSopenharmony_ci                  src3, in0_l, in1_l, in2_l, in3_l);
566cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
567cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
568cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
569cabdff1aSopenharmony_ci                  in0_l, in1_l, in2_l, in3_l);
570cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
571cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 16);
572cabdff1aSopenharmony_ci        __lsx_vst(in1_r, dst, 32);
573cabdff1aSopenharmony_ci        __lsx_vst(in1_l, dst, 48);
574cabdff1aSopenharmony_ci        __lsx_vst(in2_r, dst, 64);
575cabdff1aSopenharmony_ci        __lsx_vst(in2_l, dst, 80);
576cabdff1aSopenharmony_ci        __lsx_vst(in3_r, dst, 96);
577cabdff1aSopenharmony_ci        __lsx_vst(in3_l, dst, 112);
578cabdff1aSopenharmony_ci        dst += dst_stride;
579cabdff1aSopenharmony_ci
580cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero,
581cabdff1aSopenharmony_ci                  src7, in0_l, in1_l, in2_l, in3_l);
582cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
583cabdff1aSopenharmony_ci                  in0_r, in1_r, in2_r, in3_r);
584cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
585cabdff1aSopenharmony_ci                  in1_l, in2_l, in3_l);
586cabdff1aSopenharmony_ci        __lsx_vst(in0_r, dst, 0);
587cabdff1aSopenharmony_ci        __lsx_vst(in0_l, dst, 16);
588cabdff1aSopenharmony_ci        __lsx_vst(in1_r, dst, 32);
589cabdff1aSopenharmony_ci        __lsx_vst(in1_l, dst, 48);
590cabdff1aSopenharmony_ci        __lsx_vst(in2_r, dst, 64);
591cabdff1aSopenharmony_ci        __lsx_vst(in2_l, dst, 80);
592cabdff1aSopenharmony_ci        __lsx_vst(in3_r, dst, 96);
593cabdff1aSopenharmony_ci        __lsx_vst(in3_l, dst, 112);
594cabdff1aSopenharmony_ci        dst += dst_stride;
595cabdff1aSopenharmony_ci    }
596cabdff1aSopenharmony_ci}
597cabdff1aSopenharmony_ci
598cabdff1aSopenharmony_cistatic void hevc_hz_8t_4w_lsx(uint8_t *src, int32_t src_stride,
599cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
600cabdff1aSopenharmony_ci                              const int8_t *filter, int32_t height)
601cabdff1aSopenharmony_ci{
602cabdff1aSopenharmony_ci    uint32_t loop_cnt = height >> 3;
603cabdff1aSopenharmony_ci    uint32_t res = (height & 0x7) >> 1;
604cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
605cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
606cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
607cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
608cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
609cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
610cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
611cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
612cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
613cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
614cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
615cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
616cabdff1aSopenharmony_ci
617cabdff1aSopenharmony_ci    src -= 3;
618cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
619cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
622cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
623cabdff1aSopenharmony_ci
624cabdff1aSopenharmony_ci    for (;loop_cnt--;) {
625cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
626cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
627cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
628cabdff1aSopenharmony_ci        src += src_stride_4x;
629cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
630cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
631cabdff1aSopenharmony_ci        src7 = __lsx_vldx(src, src_stride_3x);
632cabdff1aSopenharmony_ci        src += src_stride_4x;
633cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
634cabdff1aSopenharmony_ci                  src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
635cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
636cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
637cabdff1aSopenharmony_ci                  dst0, dst0);
638cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
639cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask0, src3, src2, mask1, src3,
640cabdff1aSopenharmony_ci                  src2, mask2, src3, src2, mask3, vec0, vec1, vec2, vec3);
641cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
642cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
643cabdff1aSopenharmony_ci                  dst1, dst1);
644cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
645cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src5, src4, mask0, src5, src4, mask1, src5,
646cabdff1aSopenharmony_ci                  src4, mask2, src5, src4, mask3, vec0, vec1, vec2, vec3);
647cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
648cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
649cabdff1aSopenharmony_ci                  dst2, dst2);
650cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
651cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src7, src6, mask0, src7, src6, mask1, src7,
652cabdff1aSopenharmony_ci                  src6, mask2, src7, src6, mask3, vec0, vec1, vec2, vec3);
653cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
654cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
655cabdff1aSopenharmony_ci                  dst3, dst3);
656cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
657cabdff1aSopenharmony_ci
658cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0, dst, 0, 0);
659cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
660cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1, dst + dst_stride_2x, 0, 0);
661cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1, dst + dst_stride_3x, 0, 1);
662cabdff1aSopenharmony_ci        dst += dst_stride_4x;
663cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2, dst, 0, 0);
664cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2, dst + dst_stride, 0, 1);
665cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst3, dst + dst_stride_2x, 0, 0);
666cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst3, dst + dst_stride_3x, 0, 1);
667cabdff1aSopenharmony_ci        dst += dst_stride_4x;
668cabdff1aSopenharmony_ci    }
669cabdff1aSopenharmony_ci    for (;res--;) {
670cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
671cabdff1aSopenharmony_ci        src1 = __lsx_vldx(src, src_stride);
672cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
673cabdff1aSopenharmony_ci                  src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
674cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
675cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
676cabdff1aSopenharmony_ci                  dst0, dst0);
677cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
678cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0, dst, 0, 0);
679cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
680cabdff1aSopenharmony_ci        src += src_stride_2x;
681cabdff1aSopenharmony_ci        dst += dst_stride_2x;
682cabdff1aSopenharmony_ci    }
683cabdff1aSopenharmony_ci}
684cabdff1aSopenharmony_ci
685cabdff1aSopenharmony_cistatic void hevc_hz_8t_8w_lsx(uint8_t *src, int32_t src_stride,
686cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
687cabdff1aSopenharmony_ci                              const int8_t *filter, int32_t height)
688cabdff1aSopenharmony_ci{
689cabdff1aSopenharmony_ci    uint32_t loop_cnt;
690cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
691cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
692cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
693cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
694cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
695cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
696cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
697cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
698cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
699cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
700cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
701cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
702cabdff1aSopenharmony_ci
703cabdff1aSopenharmony_ci    src -= 3;
704cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
705cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
706cabdff1aSopenharmony_ci
707cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
708cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
709cabdff1aSopenharmony_ci
710cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
711cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
712cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
713cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
714cabdff1aSopenharmony_ci        src += src_stride_4x;
715cabdff1aSopenharmony_ci
716cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
717cabdff1aSopenharmony_ci                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
718cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
719cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
720cabdff1aSopenharmony_ci                  dst0, dst0);
721cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
722cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
723cabdff1aSopenharmony_ci                  src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
724cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
725cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
726cabdff1aSopenharmony_ci                  dst1, dst1);
727cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
728cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
729cabdff1aSopenharmony_ci                  src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
730cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
731cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
732cabdff1aSopenharmony_ci                  dst2, dst2);
733cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
734cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
735cabdff1aSopenharmony_ci                  src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
736cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
737cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
738cabdff1aSopenharmony_ci                  dst3, dst3);
739cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
740cabdff1aSopenharmony_ci
741cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
742cabdff1aSopenharmony_ci        __lsx_vstx(dst1, dst, dst_stride_x);
743cabdff1aSopenharmony_ci        __lsx_vstx(dst2, dst, dst_stride_2x);
744cabdff1aSopenharmony_ci        __lsx_vstx(dst3, dst, dst_stride_3x);
745cabdff1aSopenharmony_ci        dst += dst_stride_2x;
746cabdff1aSopenharmony_ci    }
747cabdff1aSopenharmony_ci}
748cabdff1aSopenharmony_ci
749cabdff1aSopenharmony_cistatic void hevc_hz_8t_12w_lsx(uint8_t *src, int32_t src_stride,
750cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
751cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
752cabdff1aSopenharmony_ci{
753cabdff1aSopenharmony_ci    uint32_t loop_cnt;
754cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
755cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
756cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
757cabdff1aSopenharmony_ci    uint8_t *_src;
758cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
759cabdff1aSopenharmony_ci    __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
760cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
761cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
762cabdff1aSopenharmony_ci
763cabdff1aSopenharmony_ci    src -= 3;
764cabdff1aSopenharmony_ci    _src = src + 8;
765cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
766cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
767cabdff1aSopenharmony_ci
768cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
769cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
770cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
771cabdff1aSopenharmony_ci    mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
772cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
773cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask4, 6);
774cabdff1aSopenharmony_ci
775cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
776cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
777cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
778cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
779cabdff1aSopenharmony_ci        src4 = __lsx_vld(_src, 0);
780cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
781cabdff1aSopenharmony_ci                  src5, src6);
782cabdff1aSopenharmony_ci        src7 = __lsx_vldx(_src, src_stride_3x);
783cabdff1aSopenharmony_ci        src += src_stride_4x;
784cabdff1aSopenharmony_ci        _src += src_stride_4x;
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
787cabdff1aSopenharmony_ci                  vec0, vec1);
788cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
789cabdff1aSopenharmony_ci                  vec2, vec3);
790cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask4, src7, src6, mask4,
791cabdff1aSopenharmony_ci                  vec4, vec5);
792cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
793cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
794cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
795cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
796cabdff1aSopenharmony_ci                  vec0, vec1);
797cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
798cabdff1aSopenharmony_ci                  vec2, vec3);
799cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask5, src7, src6, mask5,
800cabdff1aSopenharmony_ci                  vec4, vec5);
801cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
802cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
803cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
804cabdff1aSopenharmony_ci                  dst4, dst5);
805cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
806cabdff1aSopenharmony_ci                  vec0, vec1);
807cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
808cabdff1aSopenharmony_ci                  vec2, vec3);
809cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask6, src7, src6, mask6,
810cabdff1aSopenharmony_ci                  vec4, vec5);
811cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
812cabdff1aSopenharmony_ci                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
813cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
814cabdff1aSopenharmony_ci                  dst4, dst5);
815cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
816cabdff1aSopenharmony_ci                  vec0, vec1);
817cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
818cabdff1aSopenharmony_ci                  vec2, vec3);
819cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask7, src7, src6, mask7,
820cabdff1aSopenharmony_ci                  vec4, vec5);
821cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
822cabdff1aSopenharmony_ci                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
823cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
824cabdff1aSopenharmony_ci                  dst4, dst5);
825cabdff1aSopenharmony_ci
826cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
827cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst4, dst, 16, 0);
828cabdff1aSopenharmony_ci        dst += dst_stride;
829cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 0);
830cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst4, dst, 16, 1);
831cabdff1aSopenharmony_ci        dst += dst_stride;
832cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 0);
833cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst5, dst, 16, 0);
834cabdff1aSopenharmony_ci        dst += dst_stride;
835cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 0);
836cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst5, dst, 16, 1);
837cabdff1aSopenharmony_ci        dst += dst_stride;
838cabdff1aSopenharmony_ci    }
839cabdff1aSopenharmony_ci}
840cabdff1aSopenharmony_ci
841cabdff1aSopenharmony_cistatic void hevc_hz_8t_16w_lsx(uint8_t *src, int32_t src_stride,
842cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
843cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
844cabdff1aSopenharmony_ci{
845cabdff1aSopenharmony_ci    uint32_t loop_cnt;
846cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
847cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
848cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
849cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
850cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
851cabdff1aSopenharmony_ci    __m128i mask0;
852cabdff1aSopenharmony_ci
853cabdff1aSopenharmony_ci    src -= 3;
854cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
855cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
856cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
857cabdff1aSopenharmony_ci
858cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
859cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
860cabdff1aSopenharmony_ci
861cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
862cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
863cabdff1aSopenharmony_ci        src += src_stride;
864cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
865cabdff1aSopenharmony_ci        src += src_stride;
866cabdff1aSopenharmony_ci
867cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
868cabdff1aSopenharmony_ci                  vec0, vec1);
869cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
870cabdff1aSopenharmony_ci                  vec2, vec3);
871cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
872cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
873cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
874cabdff1aSopenharmony_ci                  vec0, vec1);
875cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
876cabdff1aSopenharmony_ci                  vec2, vec3);
877cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
878cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
879cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
880cabdff1aSopenharmony_ci                  vec0, vec1);
881cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
882cabdff1aSopenharmony_ci                  vec2, vec3);
883cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
884cabdff1aSopenharmony_ci                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
885cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
886cabdff1aSopenharmony_ci                  vec0, vec1);
887cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
888cabdff1aSopenharmony_ci                  vec2, vec3);
889cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
890cabdff1aSopenharmony_ci                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
891cabdff1aSopenharmony_ci
892cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
893cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
894cabdff1aSopenharmony_ci        dst += dst_stride;
895cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 0);
896cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 16);
897cabdff1aSopenharmony_ci        dst += dst_stride;
898cabdff1aSopenharmony_ci    }
899cabdff1aSopenharmony_ci}
900cabdff1aSopenharmony_ci
901cabdff1aSopenharmony_cistatic void hevc_hz_8t_24w_lsx(uint8_t *src, int32_t src_stride,
902cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
903cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
904cabdff1aSopenharmony_ci{
905cabdff1aSopenharmony_ci    uint32_t loop_cnt;
906cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
907cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
908cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
909cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
910cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
911cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
912cabdff1aSopenharmony_ci
913cabdff1aSopenharmony_ci    src -= 3;
914cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
915cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
916cabdff1aSopenharmony_ci
917cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
918cabdff1aSopenharmony_ci              mask2, mask3, mask4);
919cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
920cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask0, 14);
921cabdff1aSopenharmony_ci
922cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
923cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
924cabdff1aSopenharmony_ci        src += src_stride;
925cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src2, src3);
926cabdff1aSopenharmony_ci        src += src_stride;
927cabdff1aSopenharmony_ci
928cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
929cabdff1aSopenharmony_ci                  src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
930cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src3, mask0,
931cabdff1aSopenharmony_ci                  vec4, vec5);
932cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
933cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
934cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
935cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
936cabdff1aSopenharmony_ci                  src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
937cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask5, src3, src3, mask1,
938cabdff1aSopenharmony_ci                  vec4, vec5);
939cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
940cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
941cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
942cabdff1aSopenharmony_ci                  dst4, dst5);
943cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
944cabdff1aSopenharmony_ci                  src1, mask2, src2, src2, mask2, vec0, vec1, vec2, vec3);
945cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask6, src3, src3, mask2,
946cabdff1aSopenharmony_ci                  vec4, vec5);
947cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
948cabdff1aSopenharmony_ci                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
949cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
950cabdff1aSopenharmony_ci                  dst4, dst5);
951cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
952cabdff1aSopenharmony_ci                  src1, mask3, src2, src2, mask3, vec0, vec1, vec2, vec3);
953cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask7, src3, src3, mask3,
954cabdff1aSopenharmony_ci                  vec4, vec5);
955cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
956cabdff1aSopenharmony_ci                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
957cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
958cabdff1aSopenharmony_ci                  dst4, dst5);
959cabdff1aSopenharmony_ci
960cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
961cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
962cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 32);
963cabdff1aSopenharmony_ci        dst += dst_stride;
964cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 0);
965cabdff1aSopenharmony_ci        __lsx_vst(dst4, dst, 16);
966cabdff1aSopenharmony_ci        __lsx_vst(dst5, dst, 32);
967cabdff1aSopenharmony_ci        dst += dst_stride;
968cabdff1aSopenharmony_ci    }
969cabdff1aSopenharmony_ci}
970cabdff1aSopenharmony_ci
971cabdff1aSopenharmony_cistatic void hevc_hz_8t_32w_lsx(uint8_t *src, int32_t src_stride,
972cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
973cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
974cabdff1aSopenharmony_ci{
975cabdff1aSopenharmony_ci    uint32_t loop_cnt;
976cabdff1aSopenharmony_ci    __m128i src0, src1, src2;
977cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
978cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
979cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
980cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
981cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
982cabdff1aSopenharmony_ci
983cabdff1aSopenharmony_ci    src -= 3;
984cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2,  filter, 4, filter, 6,
985cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
986cabdff1aSopenharmony_ci
987cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8,
988cabdff1aSopenharmony_ci              mask1, mask2, mask3, mask4);
989cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
990cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask0, 14);
991cabdff1aSopenharmony_ci
992cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
993cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
994cabdff1aSopenharmony_ci        src2 = __lsx_vld(src, 24);
995cabdff1aSopenharmony_ci        src += src_stride;
996cabdff1aSopenharmony_ci
997cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
998cabdff1aSopenharmony_ci                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
999cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
1000cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
1001cabdff1aSopenharmony_ci                  dst0, dst0);
1002cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
1003cabdff1aSopenharmony_ci
1004cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
1005cabdff1aSopenharmony_ci                  src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
1006cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
1007cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
1008cabdff1aSopenharmony_ci                  dst1, dst1);
1009cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
1010cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
1011cabdff1aSopenharmony_ci                  mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
1012cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
1013cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
1014cabdff1aSopenharmony_ci                  dst2, dst2);
1015cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
1016cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
1017cabdff1aSopenharmony_ci                  mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
1018cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
1019cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
1020cabdff1aSopenharmony_ci                  dst3, dst3);
1021cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
1022cabdff1aSopenharmony_ci
1023cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
1024cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
1025cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 32);
1026cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 48);
1027cabdff1aSopenharmony_ci        dst += dst_stride;
1028cabdff1aSopenharmony_ci    }
1029cabdff1aSopenharmony_ci}
1030cabdff1aSopenharmony_ci
1031cabdff1aSopenharmony_cistatic void hevc_hz_8t_48w_lsx(uint8_t *src, int32_t src_stride,
1032cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
1033cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1034cabdff1aSopenharmony_ci{
1035cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1036cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
1037cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1038cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1039cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
1040cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1041cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1042cabdff1aSopenharmony_ci
1043cabdff1aSopenharmony_ci    src -= 3;
1044cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1045cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
1046cabdff1aSopenharmony_ci
1047cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
1048cabdff1aSopenharmony_ci              mask2, mask3, mask4);
1049cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
1050cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask0, 14);
1051cabdff1aSopenharmony_ci
1052cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1053cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
1054cabdff1aSopenharmony_ci        src2 = __lsx_vld(src, 32);
1055cabdff1aSopenharmony_ci        src3 = __lsx_vld(src, 40);
1056cabdff1aSopenharmony_ci        src += src_stride;
1057cabdff1aSopenharmony_ci
1058cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1, src1,
1059cabdff1aSopenharmony_ci                  mask0, src2, src1, mask4, vec0, vec1, vec2, vec3);
1060cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
1061cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
1062cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
1063cabdff1aSopenharmony_ci                  src1, mask1, src2, src1, mask5, vec0, vec1, vec2, vec3);
1064cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
1065cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
1066cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
1067cabdff1aSopenharmony_ci                  src1, mask2, src2, src1, mask6, vec0, vec1, vec2, vec3);
1068cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
1069cabdff1aSopenharmony_ci                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
1070cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
1071cabdff1aSopenharmony_ci                  src1, mask3, src2, src1, mask7, vec0, vec1, vec2, vec3);
1072cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
1073cabdff1aSopenharmony_ci                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
1074cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
1075cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
1076cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 32);
1077cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 48);
1078cabdff1aSopenharmony_ci
1079cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
1080cabdff1aSopenharmony_ci                  vec4, vec5);
1081cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
1082cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
1083cabdff1aSopenharmony_ci                  vec4, vec5);
1084cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
1085cabdff1aSopenharmony_ci                  dst4, dst5);
1086cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
1087cabdff1aSopenharmony_ci                  vec4, vec5);
1088cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
1089cabdff1aSopenharmony_ci                  dst4, dst5);
1090cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
1091cabdff1aSopenharmony_ci                  vec4, vec5);
1092cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
1093cabdff1aSopenharmony_ci                  dst4, dst5);
1094cabdff1aSopenharmony_ci        __lsx_vst(dst4, dst, 64);
1095cabdff1aSopenharmony_ci        __lsx_vst(dst5, dst, 80);
1096cabdff1aSopenharmony_ci        dst += dst_stride;
1097cabdff1aSopenharmony_ci    }
1098cabdff1aSopenharmony_ci}
1099cabdff1aSopenharmony_ci
1100cabdff1aSopenharmony_cistatic void hevc_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
1101cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
1102cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1103cabdff1aSopenharmony_ci{
1104cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1105cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4;
1106cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1107cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1108cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
1109cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1110cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1111cabdff1aSopenharmony_ci
1112cabdff1aSopenharmony_ci    src -= 3;
1113cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1114cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
1115cabdff1aSopenharmony_ci
1116cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
1117cabdff1aSopenharmony_ci              mask2, mask3, mask4);
1118cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6)
1119cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask0, 14);
1120cabdff1aSopenharmony_ci
1121cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
1122cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src, 0, src, 16,  src, 32, src, 48,
1123cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
1124cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 56);
1125cabdff1aSopenharmony_ci        src += src_stride;
1126cabdff1aSopenharmony_ci
1127cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
1128cabdff1aSopenharmony_ci                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
1129cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
1130cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
1131cabdff1aSopenharmony_ci                  dst0, dst0);
1132cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
1133cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
1134cabdff1aSopenharmony_ci
1135cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
1136cabdff1aSopenharmony_ci                  src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
1137cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
1138cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
1139cabdff1aSopenharmony_ci                  dst1, dst1);
1140cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
1141cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
1142cabdff1aSopenharmony_ci
1143cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
1144cabdff1aSopenharmony_ci                  src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
1145cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
1146cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
1147cabdff1aSopenharmony_ci                  dst2, dst2);
1148cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
1149cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 32);
1150cabdff1aSopenharmony_ci
1151cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src1, mask4, src2, src1, mask5, src2,
1152cabdff1aSopenharmony_ci                  src1, mask6, src2, src1, mask7, vec0, vec1, vec2, vec3);
1153cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
1154cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
1155cabdff1aSopenharmony_ci                  dst3, dst3);
1156cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
1157cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 48);
1158cabdff1aSopenharmony_ci
1159cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
1160cabdff1aSopenharmony_ci                  src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
1161cabdff1aSopenharmony_ci        dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
1162cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
1163cabdff1aSopenharmony_ci                  dst4, dst4);
1164cabdff1aSopenharmony_ci        dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
1165cabdff1aSopenharmony_ci        __lsx_vst(dst4, dst, 64);
1166cabdff1aSopenharmony_ci
1167cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src2, mask5, src3,
1168cabdff1aSopenharmony_ci                  src2, mask6, src3, src2, mask7, vec0, vec1, vec2, vec3);
1169cabdff1aSopenharmony_ci        dst5 = __lsx_vdp2_h_bu_b(vec0, filt0);
1170cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec1, filt1, dst5, vec2, filt2,
1171cabdff1aSopenharmony_ci                  dst5, dst5);
1172cabdff1aSopenharmony_ci        dst5 = __lsx_vdp2add_h_bu_b(dst5, vec3, filt3);
1173cabdff1aSopenharmony_ci        __lsx_vst(dst5, dst, 80);
1174cabdff1aSopenharmony_ci
1175cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
1176cabdff1aSopenharmony_ci                  src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
1177cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2_h_bu_b(vec0, filt0);
1178cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec1, filt1, dst6, vec2, filt2,
1179cabdff1aSopenharmony_ci                  dst6, dst6);
1180cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec3, filt3);
1181cabdff1aSopenharmony_ci        __lsx_vst(dst6, dst, 96);
1182cabdff1aSopenharmony_ci
1183cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
1184cabdff1aSopenharmony_ci                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
1185cabdff1aSopenharmony_ci        dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
1186cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
1187cabdff1aSopenharmony_ci                  dst7, dst7);
1188cabdff1aSopenharmony_ci        dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
1189cabdff1aSopenharmony_ci        __lsx_vst(dst7, dst, 112);
1190cabdff1aSopenharmony_ci        dst += dst_stride;
1191cabdff1aSopenharmony_ci    }
1192cabdff1aSopenharmony_ci}
1193cabdff1aSopenharmony_ci
1194cabdff1aSopenharmony_cistatic void hevc_vt_8t_4w_lsx(uint8_t *src, int32_t src_stride,
1195cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
1196cabdff1aSopenharmony_ci                              const int8_t *filter, int32_t height)
1197cabdff1aSopenharmony_ci{
1198cabdff1aSopenharmony_ci    int32_t loop_cnt;
1199cabdff1aSopenharmony_ci    int32_t res = (height & 0x07) >> 1;
1200cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1201cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1202cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1203cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
1204cabdff1aSopenharmony_ci    __m128i src9, src10, src11, src12, src13, src14;
1205cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1206cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1207cabdff1aSopenharmony_ci    __m128i src1110_r, src1211_r, src1312_r, src1413_r;
1208cabdff1aSopenharmony_ci    __m128i src2110, src4332, src6554, src8776, src10998;
1209cabdff1aSopenharmony_ci    __m128i src12111110, src14131312;
1210cabdff1aSopenharmony_ci    __m128i dst10, dst32, dst54, dst76;
1211cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1212cabdff1aSopenharmony_ci
1213cabdff1aSopenharmony_ci    src -= src_stride_3x;
1214cabdff1aSopenharmony_ci
1215cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1216cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
1217cabdff1aSopenharmony_ci
1218cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
1219cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1220cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
1221cabdff1aSopenharmony_ci    src += src_stride_4x;
1222cabdff1aSopenharmony_ci    src4 = __lsx_vld(src, 0);
1223cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
1224cabdff1aSopenharmony_ci    src += src_stride_3x;
1225cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
1226cabdff1aSopenharmony_ci              src10_r, src32_r, src54_r, src21_r);
1227cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1228cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r,
1229cabdff1aSopenharmony_ci              src2110, src4332);
1230cabdff1aSopenharmony_ci    src6554 = __lsx_vilvl_d(src65_r, src54_r);
1231cabdff1aSopenharmony_ci
1232cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 3); loop_cnt--;) {
1233cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
1234cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1235cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
1236cabdff1aSopenharmony_ci        src += src_stride_4x;
1237cabdff1aSopenharmony_ci        src11 = __lsx_vld(src, 0);
1238cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1239cabdff1aSopenharmony_ci                  src12, src13);
1240cabdff1aSopenharmony_ci        src14 = __lsx_vldx(src, src_stride_3x);
1241cabdff1aSopenharmony_ci        src += src_stride_4x;
1242cabdff1aSopenharmony_ci
1243cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
1244cabdff1aSopenharmony_ci                  src76_r, src87_r, src98_r, src109_r);
1245cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src11, src10, src12, src11, src13, src12, src14,
1246cabdff1aSopenharmony_ci                  src13, src1110_r, src1211_r, src1312_r, src1413_r);
1247cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_d, src87_r, src76_r, src109_r, src98_r, src1211_r,
1248cabdff1aSopenharmony_ci                  src1110_r, src1413_r, src1312_r, src8776, src10998,
1249cabdff1aSopenharmony_ci                  src12111110, src14131312);
1250cabdff1aSopenharmony_ci
1251cabdff1aSopenharmony_ci        dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
1252cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
1253cabdff1aSopenharmony_ci                  filt2, dst10, dst10);
1254cabdff1aSopenharmony_ci        dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
1255cabdff1aSopenharmony_ci        dst32 = __lsx_vdp2_h_bu_b(src4332, filt0);
1256cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst32, src6554, filt1, dst32, src8776,
1257cabdff1aSopenharmony_ci                  filt2, dst32, dst32);
1258cabdff1aSopenharmony_ci        dst32 = __lsx_vdp2add_h_bu_b(dst32, src10998, filt3);
1259cabdff1aSopenharmony_ci        dst54 = __lsx_vdp2_h_bu_b(src6554, filt0);
1260cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst54, src8776, filt1,
1261cabdff1aSopenharmony_ci                  dst54, src10998, filt2, dst54, dst54);
1262cabdff1aSopenharmony_ci        dst54 = __lsx_vdp2add_h_bu_b(dst54, src12111110, filt3);
1263cabdff1aSopenharmony_ci        dst76 = __lsx_vdp2_h_bu_b(src8776, filt0);
1264cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst76, src10998, filt1, dst76,
1265cabdff1aSopenharmony_ci                  src12111110, filt2, dst76, dst76);
1266cabdff1aSopenharmony_ci        dst76 = __lsx_vdp2add_h_bu_b(dst76, src14131312, filt3);
1267cabdff1aSopenharmony_ci
1268cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst10, dst, 0, 0);
1269cabdff1aSopenharmony_ci        dst += dst_stride;
1270cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst10, dst, 0, 1);
1271cabdff1aSopenharmony_ci        dst += dst_stride;
1272cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst32, dst, 0, 0);
1273cabdff1aSopenharmony_ci        dst += dst_stride;
1274cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst32, dst, 0, 1);
1275cabdff1aSopenharmony_ci        dst += dst_stride;
1276cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst54, dst, 0, 0);
1277cabdff1aSopenharmony_ci        dst += dst_stride;
1278cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst54, dst, 0, 1);
1279cabdff1aSopenharmony_ci        dst += dst_stride;
1280cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst76, dst, 0, 0);
1281cabdff1aSopenharmony_ci        dst += dst_stride;
1282cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst76, dst, 0, 1);
1283cabdff1aSopenharmony_ci        dst += dst_stride;
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci        src2110 = src10998;
1286cabdff1aSopenharmony_ci        src4332 = src12111110;
1287cabdff1aSopenharmony_ci        src6554 = src14131312;
1288cabdff1aSopenharmony_ci        src6 = src14;
1289cabdff1aSopenharmony_ci    }
1290cabdff1aSopenharmony_ci    for (;res--;) {
1291cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
1292cabdff1aSopenharmony_ci        src8 = __lsx_vldx(src, src_stride);
1293cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
1294cabdff1aSopenharmony_ci        src += src_stride_2x;
1295cabdff1aSopenharmony_ci        src8776 = __lsx_vilvl_d(src87_r, src76_r);
1296cabdff1aSopenharmony_ci
1297cabdff1aSopenharmony_ci        dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
1298cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
1299cabdff1aSopenharmony_ci                  filt2, dst10, dst10);
1300cabdff1aSopenharmony_ci        dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
1301cabdff1aSopenharmony_ci
1302cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst10, dst, 0, 0);
1303cabdff1aSopenharmony_ci        dst += dst_stride;
1304cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst10, dst, 0, 1);
1305cabdff1aSopenharmony_ci        dst += dst_stride;
1306cabdff1aSopenharmony_ci
1307cabdff1aSopenharmony_ci        src2110 = src4332;
1308cabdff1aSopenharmony_ci        src4332 = src6554;
1309cabdff1aSopenharmony_ci        src6554 = src8776;
1310cabdff1aSopenharmony_ci        src6 = src8;
1311cabdff1aSopenharmony_ci    }
1312cabdff1aSopenharmony_ci}
1313cabdff1aSopenharmony_ci
1314cabdff1aSopenharmony_cistatic void hevc_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
1315cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
1316cabdff1aSopenharmony_ci                              const int8_t *filter, int32_t height)
1317cabdff1aSopenharmony_ci{
1318cabdff1aSopenharmony_ci    int32_t loop_cnt;
1319cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1320cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
1321cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1322cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
1323cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1324cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
1325cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1326cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1327cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1328cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1329cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1330cabdff1aSopenharmony_ci
1331cabdff1aSopenharmony_ci    src -= src_stride_3x;
1332cabdff1aSopenharmony_ci
1333cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1334cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
1335cabdff1aSopenharmony_ci
1336cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
1337cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1338cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
1339cabdff1aSopenharmony_ci    src += src_stride_4x;
1340cabdff1aSopenharmony_ci    src4 = __lsx_vld(src, 0);
1341cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
1342cabdff1aSopenharmony_ci    src += src_stride_3x;
1343cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
1344cabdff1aSopenharmony_ci              src10_r, src32_r, src54_r, src21_r);
1345cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1346cabdff1aSopenharmony_ci
1347cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1348cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
1349cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1350cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
1351cabdff1aSopenharmony_ci        src += src_stride_4x;
1352cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
1353cabdff1aSopenharmony_ci                  src9, src76_r, src87_r, src98_r, src109_r);
1354cabdff1aSopenharmony_ci
1355cabdff1aSopenharmony_ci        dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
1356cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
1357cabdff1aSopenharmony_ci                  src54_r, filt2, dst0_r, dst0_r);
1358cabdff1aSopenharmony_ci        dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
1359cabdff1aSopenharmony_ci        dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
1360cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
1361cabdff1aSopenharmony_ci                  src65_r, filt2, dst1_r, dst1_r);
1362cabdff1aSopenharmony_ci        dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
1363cabdff1aSopenharmony_ci        dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
1364cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
1365cabdff1aSopenharmony_ci                  src76_r, filt2, dst2_r, dst2_r);
1366cabdff1aSopenharmony_ci        dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
1367cabdff1aSopenharmony_ci        dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
1368cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
1369cabdff1aSopenharmony_ci                  src87_r, filt2, dst3_r, dst3_r);
1370cabdff1aSopenharmony_ci        dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
1371cabdff1aSopenharmony_ci
1372cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
1373cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst, dst_stride_x);
1374cabdff1aSopenharmony_ci        __lsx_vstx(dst2_r, dst, dst_stride_2x);
1375cabdff1aSopenharmony_ci        __lsx_vstx(dst3_r, dst, dst_stride_3x);
1376cabdff1aSopenharmony_ci        dst += dst_stride_2x;
1377cabdff1aSopenharmony_ci
1378cabdff1aSopenharmony_ci        src10_r = src54_r;
1379cabdff1aSopenharmony_ci        src32_r = src76_r;
1380cabdff1aSopenharmony_ci        src54_r = src98_r;
1381cabdff1aSopenharmony_ci        src21_r = src65_r;
1382cabdff1aSopenharmony_ci        src43_r = src87_r;
1383cabdff1aSopenharmony_ci        src65_r = src109_r;
1384cabdff1aSopenharmony_ci        src6 = src10;
1385cabdff1aSopenharmony_ci    }
1386cabdff1aSopenharmony_ci}
1387cabdff1aSopenharmony_ci
1388cabdff1aSopenharmony_cistatic void hevc_vt_8t_12w_lsx(uint8_t *src, int32_t src_stride,
1389cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
1390cabdff1aSopenharmony_ci                               const int8_t *filter, int32_t height)
1391cabdff1aSopenharmony_ci{
1392cabdff1aSopenharmony_ci    int32_t loop_cnt;
1393cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1394cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1395cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1396cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1397cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1398cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1399cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1400cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
1401cabdff1aSopenharmony_ci    __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
1402cabdff1aSopenharmony_ci    __m128i src2110, src4332, src6554, src8776, src10998;
1403cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l;
1404cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1405cabdff1aSopenharmony_ci
1406cabdff1aSopenharmony_ci    src -= src_stride_3x;
1407cabdff1aSopenharmony_ci
1408cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1409cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
1410cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
1411cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1412cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
1413cabdff1aSopenharmony_ci    src += src_stride_4x;
1414cabdff1aSopenharmony_ci    src4 = __lsx_vld(src, 0);
1415cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
1416cabdff1aSopenharmony_ci    src += src_stride_3x;
1417cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
1418cabdff1aSopenharmony_ci              src10_r, src32_r, src54_r, src21_r);
1419cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1420cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
1421cabdff1aSopenharmony_ci              src10_l, src32_l, src54_l, src21_l);
1422cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
1423cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l,
1424cabdff1aSopenharmony_ci              src2110, src4332);
1425cabdff1aSopenharmony_ci    src6554 = __lsx_vilvl_d(src65_l, src54_l);
1426cabdff1aSopenharmony_ci
1427cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
1428cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
1429cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1430cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
1431cabdff1aSopenharmony_ci        src += src_stride_4x;
1432cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
1433cabdff1aSopenharmony_ci                  src9, src76_r, src87_r, src98_r, src109_r);
1434cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
1435cabdff1aSopenharmony_ci                  src9, src76_l, src87_l, src98_l, src109_l);
1436cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_d, src87_l, src76_l, src109_l, src98_l,
1437cabdff1aSopenharmony_ci                  src8776, src10998);
1438cabdff1aSopenharmony_ci
1439cabdff1aSopenharmony_ci        dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
1440cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
1441cabdff1aSopenharmony_ci                  src54_r, filt2, dst0_r, dst0_r);
1442cabdff1aSopenharmony_ci        dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
1443cabdff1aSopenharmony_ci        dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
1444cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
1445cabdff1aSopenharmony_ci                  src65_r, filt2, dst1_r, dst1_r);
1446cabdff1aSopenharmony_ci        dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
1447cabdff1aSopenharmony_ci        dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
1448cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
1449cabdff1aSopenharmony_ci                  src76_r, filt2, dst2_r, dst2_r);
1450cabdff1aSopenharmony_ci        dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
1451cabdff1aSopenharmony_ci        dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
1452cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
1453cabdff1aSopenharmony_ci                  src87_r, filt2, dst3_r, dst3_r);
1454cabdff1aSopenharmony_ci        dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
1455cabdff1aSopenharmony_ci        dst0_l = __lsx_vdp2_h_bu_b(src2110, filt0);
1456cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src4332, filt1, dst0_l,
1457cabdff1aSopenharmony_ci                  src6554, filt2, dst0_l, dst0_l);
1458cabdff1aSopenharmony_ci        dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src8776, filt3);
1459cabdff1aSopenharmony_ci        dst1_l = __lsx_vdp2_h_bu_b(src4332, filt0);
1460cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src6554, filt1, dst1_l,
1461cabdff1aSopenharmony_ci                  src8776, filt2, dst1_l, dst1_l);
1462cabdff1aSopenharmony_ci        dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src10998, filt3);
1463cabdff1aSopenharmony_ci
1464cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
1465cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_l, dst, 16, 0);
1466cabdff1aSopenharmony_ci        dst += dst_stride;
1467cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
1468cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_l, dst, 16, 1);
1469cabdff1aSopenharmony_ci        dst += dst_stride;
1470cabdff1aSopenharmony_ci        __lsx_vst(dst2_r, dst, 0);
1471cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1_l, dst, 16, 0);
1472cabdff1aSopenharmony_ci        dst += dst_stride;
1473cabdff1aSopenharmony_ci        __lsx_vst(dst3_r, dst, 0);
1474cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst1_l, dst, 16, 1);
1475cabdff1aSopenharmony_ci        dst += dst_stride;
1476cabdff1aSopenharmony_ci
1477cabdff1aSopenharmony_ci        src10_r = src54_r;
1478cabdff1aSopenharmony_ci        src32_r = src76_r;
1479cabdff1aSopenharmony_ci        src54_r = src98_r;
1480cabdff1aSopenharmony_ci        src21_r = src65_r;
1481cabdff1aSopenharmony_ci        src43_r = src87_r;
1482cabdff1aSopenharmony_ci        src65_r = src109_r;
1483cabdff1aSopenharmony_ci        src2110 = src6554;
1484cabdff1aSopenharmony_ci        src4332 = src8776;
1485cabdff1aSopenharmony_ci        src6554 = src10998;
1486cabdff1aSopenharmony_ci        src6 = src10;
1487cabdff1aSopenharmony_ci    }
1488cabdff1aSopenharmony_ci}
1489cabdff1aSopenharmony_ci
1490cabdff1aSopenharmony_cistatic void hevc_vt_8t_16multx4mult_lsx(uint8_t *src,
1491cabdff1aSopenharmony_ci                                        int32_t src_stride,
1492cabdff1aSopenharmony_ci                                        int16_t *dst,
1493cabdff1aSopenharmony_ci                                        int32_t dst_stride,
1494cabdff1aSopenharmony_ci                                        const int8_t *filter,
1495cabdff1aSopenharmony_ci                                        int32_t height,
1496cabdff1aSopenharmony_ci                                        int32_t width)
1497cabdff1aSopenharmony_ci{
1498cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1499cabdff1aSopenharmony_ci    int16_t *dst_tmp;
1500cabdff1aSopenharmony_ci    int32_t loop_cnt, cnt;
1501cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1502cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1503cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1504cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1505cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
1506cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
1507cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1508cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
1509cabdff1aSopenharmony_ci    __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
1510cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l, dst2_l, dst3_l;
1511cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1512cabdff1aSopenharmony_ci
1513cabdff1aSopenharmony_ci    src -= src_stride_3x;
1514cabdff1aSopenharmony_ci
1515cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1516cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
1517cabdff1aSopenharmony_ci
1518cabdff1aSopenharmony_ci    for (cnt = width >> 4; cnt--;) {
1519cabdff1aSopenharmony_ci        src_tmp = src;
1520cabdff1aSopenharmony_ci        dst_tmp = dst;
1521cabdff1aSopenharmony_ci
1522cabdff1aSopenharmony_ci        src0 = __lsx_vld(src_tmp, 0);
1523cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1524cabdff1aSopenharmony_ci                  src1, src2);
1525cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src_tmp, src_stride_3x);
1526cabdff1aSopenharmony_ci        src_tmp += src_stride_4x;
1527cabdff1aSopenharmony_ci        src4 = __lsx_vld(src_tmp, 0);
1528cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1529cabdff1aSopenharmony_ci                  src5, src6);
1530cabdff1aSopenharmony_ci        src_tmp += src_stride_3x;
1531cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
1532cabdff1aSopenharmony_ci                  src10_r, src32_r, src54_r, src21_r);
1533cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
1534cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
1535cabdff1aSopenharmony_ci                  src10_l, src32_l, src54_l, src21_l);
1536cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
1537cabdff1aSopenharmony_ci
1538cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
1539cabdff1aSopenharmony_ci            src7 = __lsx_vld(src_tmp, 0);
1540cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1541cabdff1aSopenharmony_ci                      src8, src9);
1542cabdff1aSopenharmony_ci            src10 = __lsx_vldx(src_tmp, src_stride_3x);
1543cabdff1aSopenharmony_ci            src_tmp += src_stride_4x;
1544cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
1545cabdff1aSopenharmony_ci                      src10, src9, src76_r, src87_r, src98_r, src109_r);
1546cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
1547cabdff1aSopenharmony_ci                      src10, src9, src76_l, src87_l, src98_l, src109_l);
1548cabdff1aSopenharmony_ci
1549cabdff1aSopenharmony_ci            dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
1550cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
1551cabdff1aSopenharmony_ci                      src54_r, filt2, dst0_r, dst0_r);
1552cabdff1aSopenharmony_ci            dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
1553cabdff1aSopenharmony_ci            dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
1554cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
1555cabdff1aSopenharmony_ci                      src65_r, filt2, dst1_r, dst1_r);
1556cabdff1aSopenharmony_ci            dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
1557cabdff1aSopenharmony_ci            dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
1558cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
1559cabdff1aSopenharmony_ci                      src76_r, filt2, dst2_r, dst2_r);
1560cabdff1aSopenharmony_ci            dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
1561cabdff1aSopenharmony_ci            dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
1562cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
1563cabdff1aSopenharmony_ci                      src87_r, filt2, dst3_r, dst3_r);
1564cabdff1aSopenharmony_ci            dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
1565cabdff1aSopenharmony_ci            dst0_l = __lsx_vdp2_h_bu_b(src10_l, filt0);
1566cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src32_l, filt1, dst0_l,
1567cabdff1aSopenharmony_ci                      src54_l, filt2, dst0_l, dst0_l);
1568cabdff1aSopenharmony_ci            dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src76_l, filt3);
1569cabdff1aSopenharmony_ci            dst1_l = __lsx_vdp2_h_bu_b(src21_l, filt0);
1570cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src43_l, filt1, dst1_l,
1571cabdff1aSopenharmony_ci                      src65_l, filt2, dst1_l, dst1_l);
1572cabdff1aSopenharmony_ci            dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src87_l, filt3);
1573cabdff1aSopenharmony_ci            dst2_l = __lsx_vdp2_h_bu_b(src32_l, filt0);
1574cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_l, src54_l, filt1, dst2_l,
1575cabdff1aSopenharmony_ci                      src76_l, filt2, dst2_l, dst2_l);
1576cabdff1aSopenharmony_ci            dst2_l = __lsx_vdp2add_h_bu_b(dst2_l, src98_l, filt3);
1577cabdff1aSopenharmony_ci            dst3_l = __lsx_vdp2_h_bu_b(src43_l, filt0);
1578cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_l, src65_l, filt1, dst3_l,
1579cabdff1aSopenharmony_ci                      src87_l, filt2, dst3_l, dst3_l);
1580cabdff1aSopenharmony_ci            dst3_l = __lsx_vdp2add_h_bu_b(dst3_l, src109_l, filt3);
1581cabdff1aSopenharmony_ci
1582cabdff1aSopenharmony_ci            __lsx_vst(dst0_r, dst_tmp, 0);
1583cabdff1aSopenharmony_ci            __lsx_vst(dst0_l, dst_tmp, 16);
1584cabdff1aSopenharmony_ci            dst_tmp += dst_stride;
1585cabdff1aSopenharmony_ci            __lsx_vst(dst1_r, dst_tmp, 0);
1586cabdff1aSopenharmony_ci            __lsx_vst(dst1_l, dst_tmp, 16);
1587cabdff1aSopenharmony_ci            dst_tmp += dst_stride;
1588cabdff1aSopenharmony_ci            __lsx_vst(dst2_r, dst_tmp, 0);
1589cabdff1aSopenharmony_ci            __lsx_vst(dst2_l, dst_tmp, 16);
1590cabdff1aSopenharmony_ci            dst_tmp += dst_stride;
1591cabdff1aSopenharmony_ci            __lsx_vst(dst3_r, dst_tmp, 0);
1592cabdff1aSopenharmony_ci            __lsx_vst(dst3_l, dst_tmp, 16);
1593cabdff1aSopenharmony_ci            dst_tmp += dst_stride;
1594cabdff1aSopenharmony_ci
1595cabdff1aSopenharmony_ci            src10_r = src54_r;
1596cabdff1aSopenharmony_ci            src32_r = src76_r;
1597cabdff1aSopenharmony_ci            src54_r = src98_r;
1598cabdff1aSopenharmony_ci            src21_r = src65_r;
1599cabdff1aSopenharmony_ci            src43_r = src87_r;
1600cabdff1aSopenharmony_ci            src65_r = src109_r;
1601cabdff1aSopenharmony_ci            src10_l = src54_l;
1602cabdff1aSopenharmony_ci            src32_l = src76_l;
1603cabdff1aSopenharmony_ci            src54_l = src98_l;
1604cabdff1aSopenharmony_ci            src21_l = src65_l;
1605cabdff1aSopenharmony_ci            src43_l = src87_l;
1606cabdff1aSopenharmony_ci            src65_l = src109_l;
1607cabdff1aSopenharmony_ci            src6 = src10;
1608cabdff1aSopenharmony_ci        }
1609cabdff1aSopenharmony_ci        src += 16;
1610cabdff1aSopenharmony_ci        dst += 16;
1611cabdff1aSopenharmony_ci    }
1612cabdff1aSopenharmony_ci}
1613cabdff1aSopenharmony_ci
/* Vertical 8-tap filter, 16 columns wide: a single 16-column strip. */
static void hevc_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    const int32_t block_width = 16;

    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, filter,
                                height, block_width);
}
1621cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter, 24 columns wide: columns 0-15 go through the
 * 16-column strip kernel, then columns 16-23 through the 8-column kernel.
 */
static void hevc_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    const int32_t strip_width = 16;
    uint8_t *src_rest = src + 16;
    int16_t *dst_rest = dst + 16;

    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, filter,
                                height, strip_width);
    hevc_vt_8t_8w_lsx(src_rest, src_stride, dst_rest, dst_stride, filter,
                      height);
}
1631cabdff1aSopenharmony_ci
/* Vertical 8-tap filter, 32 columns wide: two 16-column strips. */
static void hevc_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    const int32_t block_width = 32;

    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, filter,
                                height, block_width);
}
1639cabdff1aSopenharmony_ci
/* Vertical 8-tap filter, 48 columns wide: three 16-column strips. */
static void hevc_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    const int32_t block_width = 48;

    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, filter,
                                height, block_width);
}
1647cabdff1aSopenharmony_ci
/* Vertical 8-tap filter, 64 columns wide: four 16-column strips. */
static void hevc_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    const int32_t block_width = 64;

    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride, filter,
                                height, block_width);
}
1655cabdff1aSopenharmony_ci
1656cabdff1aSopenharmony_cistatic void hevc_hv_8t_4w_lsx(uint8_t *src, int32_t src_stride,
1657cabdff1aSopenharmony_ci                              int16_t *dst, int32_t dst_stride,
1658cabdff1aSopenharmony_ci                              const int8_t *filter_x, const int8_t *filter_y,
1659cabdff1aSopenharmony_ci                              int32_t height)
1660cabdff1aSopenharmony_ci{
1661cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1662cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1663cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1664cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1665cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1666cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1667cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
1668cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
1669cabdff1aSopenharmony_ci    __m128i filter_vec;
1670cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1671cabdff1aSopenharmony_ci    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1672cabdff1aSopenharmony_ci    __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1673cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
1674cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1675cabdff1aSopenharmony_ci    __m128i dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1676cabdff1aSopenharmony_ci    __m128i mask0;
1677cabdff1aSopenharmony_ci
1678cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
1679cabdff1aSopenharmony_ci
1680cabdff1aSopenharmony_ci    src -= src_stride_3x + 3;
1681cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
1682cabdff1aSopenharmony_ci              filter_x, 6, filt0, filt1, filt2, filt3);
1683cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1684cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1685cabdff1aSopenharmony_ci
1686cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
1687cabdff1aSopenharmony_ci              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
1688cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1689cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
1690cabdff1aSopenharmony_ci
1691cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
1692cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1693cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
1694cabdff1aSopenharmony_ci    src += src_stride_4x;
1695cabdff1aSopenharmony_ci    src4 = __lsx_vld(src, 0);
1696cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
1697cabdff1aSopenharmony_ci    src += src_stride_3x;
1698cabdff1aSopenharmony_ci
1699cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask0, src3, src0, mask1, src3, src0,
1700cabdff1aSopenharmony_ci              mask2, src3, src0, mask3, vec0, vec1, vec2, vec3);
1701cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask0, src4, src1, mask1, src4, src1,
1702cabdff1aSopenharmony_ci              mask2, src4, src1, mask3, vec4, vec5, vec6, vec7);
1703cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask0, src5, src2, mask1, src5, src2,
1704cabdff1aSopenharmony_ci              mask2, src5, src2, mask3, vec8, vec9, vec10, vec11);
1705cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask0, src6, src3, mask1, src6, src3,
1706cabdff1aSopenharmony_ci              mask2, src6, src3, mask3, vec12, vec13, vec14, vec15);
1707cabdff1aSopenharmony_ci    dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
1708cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
1709cabdff1aSopenharmony_ci              dst30, dst30);
1710cabdff1aSopenharmony_ci    dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
1711cabdff1aSopenharmony_ci    dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
1712cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
1713cabdff1aSopenharmony_ci              dst41, dst41);
1714cabdff1aSopenharmony_ci    dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
1715cabdff1aSopenharmony_ci    dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
1716cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
1717cabdff1aSopenharmony_ci              dst52, dst52);
1718cabdff1aSopenharmony_ci    dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
1719cabdff1aSopenharmony_ci    dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
1720cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
1721cabdff1aSopenharmony_ci              dst63, dst63);
1722cabdff1aSopenharmony_ci    dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
1723cabdff1aSopenharmony_ci
1724cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
1725cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
1726cabdff1aSopenharmony_ci    dst32_r = __lsx_vilvl_h(dst63, dst52);
1727cabdff1aSopenharmony_ci    dst65_r = __lsx_vilvh_h(dst63, dst52);
1728cabdff1aSopenharmony_ci    dst66 = __lsx_vreplvei_d(dst63, 1);
1729cabdff1aSopenharmony_ci
1730cabdff1aSopenharmony_ci    for (loop_cnt = height >> 2; loop_cnt--;) {
1731cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
1732cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1733cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
1734cabdff1aSopenharmony_ci        src += src_stride_4x;
1735cabdff1aSopenharmony_ci
1736cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask0, src9, src7, mask1, src9, src7,
1737cabdff1aSopenharmony_ci                  mask2, src9, src7, mask3, vec0, vec1, vec2, vec3);
1738cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask0, src10, src8, mask1, src10, src8,
1739cabdff1aSopenharmony_ci                  mask2, src10, src8, mask3, vec4, vec5, vec6, vec7);
1740cabdff1aSopenharmony_ci
1741cabdff1aSopenharmony_ci        dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
1742cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
1743cabdff1aSopenharmony_ci                  dst97, dst97);
1744cabdff1aSopenharmony_ci        dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
1745cabdff1aSopenharmony_ci        dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
1746cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
1747cabdff1aSopenharmony_ci                  filt2, dst108, dst108);
1748cabdff1aSopenharmony_ci        dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
1749cabdff1aSopenharmony_ci
1750cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
1751cabdff1aSopenharmony_ci        dst109_r = __lsx_vilvh_h(dst108, dst97);
1752cabdff1aSopenharmony_ci        dst66 = __lsx_vreplvei_d(dst97, 1);
1753cabdff1aSopenharmony_ci        dst98_r = __lsx_vilvl_h(dst66, dst108);
1754cabdff1aSopenharmony_ci
1755cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
1756cabdff1aSopenharmony_ci                  filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
1757cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
1758cabdff1aSopenharmony_ci                  filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
1759cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
1760cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
1761cabdff1aSopenharmony_ci                  filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
1762cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
1763cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
1764cabdff1aSopenharmony_ci                  filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
1765cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
1766cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
1767cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
1768cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1769cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst, 0, 0);
1770cabdff1aSopenharmony_ci        dst += dst_stride;
1771cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst, 0, 1);
1772cabdff1aSopenharmony_ci        dst += dst_stride;
1773cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst, 0, 0);
1774cabdff1aSopenharmony_ci        dst += dst_stride;
1775cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst, 0, 1);
1776cabdff1aSopenharmony_ci        dst += dst_stride;
1777cabdff1aSopenharmony_ci
1778cabdff1aSopenharmony_ci        dst10_r = dst54_r;
1779cabdff1aSopenharmony_ci        dst32_r = dst76_r;
1780cabdff1aSopenharmony_ci        dst54_r = dst98_r;
1781cabdff1aSopenharmony_ci        dst21_r = dst65_r;
1782cabdff1aSopenharmony_ci        dst43_r = dst87_r;
1783cabdff1aSopenharmony_ci        dst65_r = dst109_r;
1784cabdff1aSopenharmony_ci        dst66 = __lsx_vreplvei_d(dst108, 1);
1785cabdff1aSopenharmony_ci    }
1786cabdff1aSopenharmony_ci}
1787cabdff1aSopenharmony_ci
1788cabdff1aSopenharmony_cistatic void hevc_hv_8t_8multx1mult_lsx(uint8_t *src,
1789cabdff1aSopenharmony_ci                                       int32_t src_stride,
1790cabdff1aSopenharmony_ci                                       int16_t *dst,
1791cabdff1aSopenharmony_ci                                       int32_t dst_stride,
1792cabdff1aSopenharmony_ci                                       const int8_t *filter_x,
1793cabdff1aSopenharmony_ci                                       const int8_t *filter_y,
1794cabdff1aSopenharmony_ci                                       int32_t height,
1795cabdff1aSopenharmony_ci                                       int32_t width)
1796cabdff1aSopenharmony_ci{
1797cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1798cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1799cabdff1aSopenharmony_ci    int16_t *dst_tmp;
1800cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1801cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1802cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1803cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1804cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
1805cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
1806cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
1807cabdff1aSopenharmony_ci    __m128i filter_vec;
1808cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1809cabdff1aSopenharmony_ci    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1810cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1811cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l;
1812cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
1813cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
1814cabdff1aSopenharmony_ci    __m128i mask0 = {0x403030202010100, 0x807070606050504};
1815cabdff1aSopenharmony_ci
1816cabdff1aSopenharmony_ci    src -= src_stride_3x + 3;
1817cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
1818cabdff1aSopenharmony_ci              filter_x, 6, filt0, filt1, filt2, filt3);
1819cabdff1aSopenharmony_ci
1820cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1821cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1822cabdff1aSopenharmony_ci
1823cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
1824cabdff1aSopenharmony_ci              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
1825cabdff1aSopenharmony_ci
1826cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1827cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
1828cabdff1aSopenharmony_ci
1829cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
1830cabdff1aSopenharmony_ci        src_tmp = src;
1831cabdff1aSopenharmony_ci        dst_tmp = dst;
1832cabdff1aSopenharmony_ci        src0 = __lsx_vld(src_tmp, 0);
1833cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1834cabdff1aSopenharmony_ci                  src1, src2);
1835cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src_tmp, src_stride_3x);
1836cabdff1aSopenharmony_ci        src_tmp += src_stride_4x;
1837cabdff1aSopenharmony_ci        src4 = __lsx_vld(src_tmp, 0);
1838cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1839cabdff1aSopenharmony_ci                  src5, src6);
1840cabdff1aSopenharmony_ci        src_tmp += src_stride_3x;
1841cabdff1aSopenharmony_ci
1842cabdff1aSopenharmony_ci        /* row 0 row 1 row 2 row 3 */
1843cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
1844cabdff1aSopenharmony_ci                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
1845cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
1846cabdff1aSopenharmony_ci                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
1847cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
1848cabdff1aSopenharmony_ci                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
1849cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
1850cabdff1aSopenharmony_ci                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
1851cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
1852cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
1853cabdff1aSopenharmony_ci                  dst0, dst0);
1854cabdff1aSopenharmony_ci        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
1855cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
1856cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
1857cabdff1aSopenharmony_ci                  dst1, dst1);
1858cabdff1aSopenharmony_ci        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
1859cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
1860cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
1861cabdff1aSopenharmony_ci                  dst2, dst2);
1862cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
1863cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
1864cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
1865cabdff1aSopenharmony_ci                  dst3, dst3);
1866cabdff1aSopenharmony_ci        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
1867cabdff1aSopenharmony_ci
1868cabdff1aSopenharmony_ci        /* row 4 row 5 row 6 */
1869cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
1870cabdff1aSopenharmony_ci                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
1871cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
1872cabdff1aSopenharmony_ci                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
1873cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
1874cabdff1aSopenharmony_ci                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
1875cabdff1aSopenharmony_ci        dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
1876cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
1877cabdff1aSopenharmony_ci                  dst4, dst4);
1878cabdff1aSopenharmony_ci        dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
1879cabdff1aSopenharmony_ci        dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
1880cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
1881cabdff1aSopenharmony_ci                  dst5, dst5);
1882cabdff1aSopenharmony_ci        dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
1883cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
1884cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
1885cabdff1aSopenharmony_ci                  dst6, dst6);
1886cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
1887cabdff1aSopenharmony_ci
1888cabdff1aSopenharmony_ci        for (loop_cnt = height; loop_cnt--;) {
1889cabdff1aSopenharmony_ci            src7 = __lsx_vld(src_tmp, 0);
1890cabdff1aSopenharmony_ci            src_tmp += src_stride;
1891cabdff1aSopenharmony_ci
1892cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
1893cabdff1aSopenharmony_ci                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
1894cabdff1aSopenharmony_ci            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
1895cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
1896cabdff1aSopenharmony_ci                      filt2, dst7, dst7);
1897cabdff1aSopenharmony_ci            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
1898cabdff1aSopenharmony_ci
1899cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
1900cabdff1aSopenharmony_ci                      dst6, dst10_r, dst32_r, dst54_r, dst76_r);
1901cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
1902cabdff1aSopenharmony_ci                      dst6, dst10_l, dst32_l, dst54_l, dst76_l);
1903cabdff1aSopenharmony_ci
1904cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
1905cabdff1aSopenharmony_ci                      dst0_r, dst0_l);
1906cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1907cabdff1aSopenharmony_ci                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
1908cabdff1aSopenharmony_ci                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
1909cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
1910cabdff1aSopenharmony_ci                      dst76_l, filt_h3, dst0_r, dst0_l);
1911cabdff1aSopenharmony_ci            dst0_r = __lsx_vsrai_w(dst0_r, 6);
1912cabdff1aSopenharmony_ci            dst0_l = __lsx_vsrai_w(dst0_l, 6);
1913cabdff1aSopenharmony_ci
1914cabdff1aSopenharmony_ci            dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
1915cabdff1aSopenharmony_ci            __lsx_vst(dst0_r, dst_tmp, 0);
1916cabdff1aSopenharmony_ci            dst_tmp += dst_stride;
1917cabdff1aSopenharmony_ci
1918cabdff1aSopenharmony_ci            dst0 = dst1;
1919cabdff1aSopenharmony_ci            dst1 = dst2;
1920cabdff1aSopenharmony_ci            dst2 = dst3;
1921cabdff1aSopenharmony_ci            dst3 = dst4;
1922cabdff1aSopenharmony_ci            dst4 = dst5;
1923cabdff1aSopenharmony_ci            dst5 = dst6;
1924cabdff1aSopenharmony_ci            dst6 = dst7;
1925cabdff1aSopenharmony_ci        }
1926cabdff1aSopenharmony_ci        src += 8;
1927cabdff1aSopenharmony_ci        dst += 8;
1928cabdff1aSopenharmony_ci    }
1929cabdff1aSopenharmony_ci}
1930cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter, 8 columns wide.
 * Forwards to hevc_hv_8t_8multx1mult_lsx(), which walks the block in
 * (width >> 3) strips of 8 columns; here that is a single strip.
 */
static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    const int32_t width = 8;

    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, width);
}
1939cabdff1aSopenharmony_ci
1940cabdff1aSopenharmony_cistatic void hevc_hv_8t_12w_lsx(uint8_t *src, int32_t src_stride,
1941cabdff1aSopenharmony_ci                               int16_t *dst, int32_t dst_stride,
1942cabdff1aSopenharmony_ci                               const int8_t *filter_x, const int8_t *filter_y,
1943cabdff1aSopenharmony_ci                               int32_t height)
1944cabdff1aSopenharmony_ci{
1945cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1946cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1947cabdff1aSopenharmony_ci    int16_t *dst_tmp;
1948cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
1949cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
1950cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
1951cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1952cabdff1aSopenharmony_ci    __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1953cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1954cabdff1aSopenharmony_ci    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1955cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1956cabdff1aSopenharmony_ci    __m128i filter_vec;
1957cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1958cabdff1aSopenharmony_ci    __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1959cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
1960cabdff1aSopenharmony_ci    __m128i dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
1961cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
1962cabdff1aSopenharmony_ci
1963cabdff1aSopenharmony_ci    src -= src_stride_3x + 3;
1964cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
1965cabdff1aSopenharmony_ci              filter_x, 6, filt0, filt1, filt2, filt3);
1966cabdff1aSopenharmony_ci
1967cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1968cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1969cabdff1aSopenharmony_ci
1970cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
1971cabdff1aSopenharmony_ci              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
1972cabdff1aSopenharmony_ci
1973cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1974cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1975cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
1976cabdff1aSopenharmony_ci
1977cabdff1aSopenharmony_ci    src_tmp = src;
1978cabdff1aSopenharmony_ci    dst_tmp = dst;
1979cabdff1aSopenharmony_ci
1980cabdff1aSopenharmony_ci    src0 = __lsx_vld(src_tmp, 0);
1981cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1982cabdff1aSopenharmony_ci              src1, src2);
1983cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src_tmp, src_stride_3x);
1984cabdff1aSopenharmony_ci    src_tmp += src_stride_4x;
1985cabdff1aSopenharmony_ci    src4 = __lsx_vld(src_tmp, 0);
1986cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1987cabdff1aSopenharmony_ci              src5, src6);
1988cabdff1aSopenharmony_ci    src_tmp += src_stride_3x;
1989cabdff1aSopenharmony_ci
1990cabdff1aSopenharmony_ci    /* row 0 row 1 row 2 row 3 */
1991cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, src0,
1992cabdff1aSopenharmony_ci              mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
1993cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
1994cabdff1aSopenharmony_ci              mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
1995cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
1996cabdff1aSopenharmony_ci              mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
1997cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, src3,
1998cabdff1aSopenharmony_ci              mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
1999cabdff1aSopenharmony_ci    dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
2000cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
2001cabdff1aSopenharmony_ci              dst0, dst0);
2002cabdff1aSopenharmony_ci    dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
2003cabdff1aSopenharmony_ci    dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
2004cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
2005cabdff1aSopenharmony_ci              dst1, dst1);
2006cabdff1aSopenharmony_ci    dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
2007cabdff1aSopenharmony_ci    dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
2008cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
2009cabdff1aSopenharmony_ci              dst2, dst2);
2010cabdff1aSopenharmony_ci    dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
2011cabdff1aSopenharmony_ci    dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
2012cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
2013cabdff1aSopenharmony_ci              dst3, dst3);
2014cabdff1aSopenharmony_ci    dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
2015cabdff1aSopenharmony_ci
2016cabdff1aSopenharmony_ci    /* row 4 row 5 row 6 */
2017cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, src4,
2018cabdff1aSopenharmony_ci              mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
2019cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, src5,
2020cabdff1aSopenharmony_ci              mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
2021cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, src6,
2022cabdff1aSopenharmony_ci              mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
2023cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
2024cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
2025cabdff1aSopenharmony_ci              dst4, dst4);
2026cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
2027cabdff1aSopenharmony_ci    dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
2028cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
2029cabdff1aSopenharmony_ci              dst5, dst5);
2030cabdff1aSopenharmony_ci    dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
2031cabdff1aSopenharmony_ci    dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
2032cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
2033cabdff1aSopenharmony_ci              dst6, dst6);
2034cabdff1aSopenharmony_ci    dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
2035cabdff1aSopenharmony_ci
2036cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
2037cabdff1aSopenharmony_ci        src7 = __lsx_vld(src_tmp, 0);
2038cabdff1aSopenharmony_ci        src_tmp += src_stride;
2039cabdff1aSopenharmony_ci
2040cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
2041cabdff1aSopenharmony_ci                  src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
2042cabdff1aSopenharmony_ci        dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
2043cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
2044cabdff1aSopenharmony_ci                  dst7, dst7);
2045cabdff1aSopenharmony_ci        dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
2046cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
2047cabdff1aSopenharmony_ci                  dst10_r, dst32_r, dst54_r, dst76_r);
2048cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
2049cabdff1aSopenharmony_ci                  dst10_l, dst32_l, dst54_l, dst76_l);
2050cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
2051cabdff1aSopenharmony_ci                  dst0_r, dst0_l);
2052cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2053cabdff1aSopenharmony_ci                  filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, dst54_l, filt_h2,
2054cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst0_r, dst0_l);
2055cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, dst76_l,
2056cabdff1aSopenharmony_ci                  filt_h3, dst0_r, dst0_l)
2057cabdff1aSopenharmony_ci        dst0_r = __lsx_vsrai_w(dst0_r, 6);
2058cabdff1aSopenharmony_ci        dst0_l = __lsx_vsrai_w(dst0_l, 6);
2059cabdff1aSopenharmony_ci
2060cabdff1aSopenharmony_ci        dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
2061cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst_tmp, 0);
2062cabdff1aSopenharmony_ci        dst_tmp += dst_stride;
2063cabdff1aSopenharmony_ci
2064cabdff1aSopenharmony_ci        dst0 = dst1;
2065cabdff1aSopenharmony_ci        dst1 = dst2;
2066cabdff1aSopenharmony_ci        dst2 = dst3;
2067cabdff1aSopenharmony_ci        dst3 = dst4;
2068cabdff1aSopenharmony_ci        dst4 = dst5;
2069cabdff1aSopenharmony_ci        dst5 = dst6;
2070cabdff1aSopenharmony_ci        dst6 = dst7;
2071cabdff1aSopenharmony_ci    }
2072cabdff1aSopenharmony_ci    src += 8;
2073cabdff1aSopenharmony_ci    dst += 8;
2074cabdff1aSopenharmony_ci
2075cabdff1aSopenharmony_ci    mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
2076cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
2077cabdff1aSopenharmony_ci    mask7 = __lsx_vaddi_bu(mask4, 6);
2078cabdff1aSopenharmony_ci
2079cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
2080cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
2081cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
2082cabdff1aSopenharmony_ci    src += src_stride_4x;
2083cabdff1aSopenharmony_ci    src4 = __lsx_vld(src, 0);
2084cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
2085cabdff1aSopenharmony_ci    src += src_stride_3x;
2086cabdff1aSopenharmony_ci
2087cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask4, src3, src0, mask5, src3, src0,
2088cabdff1aSopenharmony_ci              mask6, src3, src0, mask7, vec0, vec1, vec2, vec3);
2089cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask4, src4, src1, mask5, src4, src1,
2090cabdff1aSopenharmony_ci              mask6, src4, src1, mask7, vec4, vec5, vec6, vec7);
2091cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask4, src5, src2, mask5, src5, src2,
2092cabdff1aSopenharmony_ci              mask6, src5, src2, mask7, vec8, vec9, vec10, vec11);
2093cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask4, src6, src3, mask5, src6, src3,
2094cabdff1aSopenharmony_ci              mask6, src6, src3, mask7, vec12, vec13, vec14, vec15);
2095cabdff1aSopenharmony_ci    dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
2096cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
2097cabdff1aSopenharmony_ci              dst30, dst30);
2098cabdff1aSopenharmony_ci    dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
2099cabdff1aSopenharmony_ci    dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
2100cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
2101cabdff1aSopenharmony_ci              dst41, dst41);
2102cabdff1aSopenharmony_ci    dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
2103cabdff1aSopenharmony_ci    dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
2104cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
2105cabdff1aSopenharmony_ci              dst52, dst52);
2106cabdff1aSopenharmony_ci    dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
2107cabdff1aSopenharmony_ci    dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
2108cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
2109cabdff1aSopenharmony_ci              dst63, dst63);
2110cabdff1aSopenharmony_ci    dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
2111cabdff1aSopenharmony_ci
2112cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
2113cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
2114cabdff1aSopenharmony_ci    dst32_r = __lsx_vilvl_h(dst63, dst52);
2115cabdff1aSopenharmony_ci    dst65_r = __lsx_vilvh_h(dst63, dst52);
2116cabdff1aSopenharmony_ci
2117cabdff1aSopenharmony_ci    dst66 = __lsx_vreplvei_d(dst63, 1);
2118cabdff1aSopenharmony_ci
2119cabdff1aSopenharmony_ci    for (loop_cnt = height >> 2; loop_cnt--;) {
2120cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
2121cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
2122cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
2123cabdff1aSopenharmony_ci        src += src_stride_4x;
2124cabdff1aSopenharmony_ci
2125cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask4, src9, src7, mask5, src9,
2126cabdff1aSopenharmony_ci                  src7, mask6, src9, src7, mask7, vec0, vec1, vec2, vec3);
2127cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask4, src10, src8, mask5, src10,
2128cabdff1aSopenharmony_ci                  src8, mask6, src10, src8, mask7, vec4, vec5, vec6, vec7);
2129cabdff1aSopenharmony_ci        dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
2130cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
2131cabdff1aSopenharmony_ci                  dst97, dst97);
2132cabdff1aSopenharmony_ci        dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
2133cabdff1aSopenharmony_ci        dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
2134cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
2135cabdff1aSopenharmony_ci                  filt2, dst108, dst108);
2136cabdff1aSopenharmony_ci        dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
2137cabdff1aSopenharmony_ci
2138cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
2139cabdff1aSopenharmony_ci        dst109_r = __lsx_vilvh_h(dst108, dst97);
2140cabdff1aSopenharmony_ci        dst66 = __lsx_vreplvei_d(dst97, 1);
2141cabdff1aSopenharmony_ci        dst98_r = __lsx_vilvl_h(dst66, dst108);
2142cabdff1aSopenharmony_ci
2143cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
2144cabdff1aSopenharmony_ci                  filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
2145cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
2146cabdff1aSopenharmony_ci                  filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
2147cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
2148cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
2149cabdff1aSopenharmony_ci                  filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
2150cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
2151cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
2152cabdff1aSopenharmony_ci                  filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
2153cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
2154cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
2155cabdff1aSopenharmony_ci                  dst0_r, dst1_r, dst2_r, dst3_r);
2156cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r,
2157cabdff1aSopenharmony_ci                  dst0_r, dst2_r);
2158cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst, 0, 0);
2159cabdff1aSopenharmony_ci        dst += dst_stride;
2160cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst0_r, dst, 0, 1);
2161cabdff1aSopenharmony_ci        dst += dst_stride;
2162cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst, 0, 0);
2163cabdff1aSopenharmony_ci        dst += dst_stride;
2164cabdff1aSopenharmony_ci        __lsx_vstelm_d(dst2_r, dst, 0, 1);
2165cabdff1aSopenharmony_ci        dst += dst_stride;
2166cabdff1aSopenharmony_ci
2167cabdff1aSopenharmony_ci        dst10_r = dst54_r;
2168cabdff1aSopenharmony_ci        dst32_r = dst76_r;
2169cabdff1aSopenharmony_ci        dst54_r = dst98_r;
2170cabdff1aSopenharmony_ci        dst21_r = dst65_r;
2171cabdff1aSopenharmony_ci        dst43_r = dst87_r;
2172cabdff1aSopenharmony_ci        dst65_r = dst109_r;
2173cabdff1aSopenharmony_ci        dst66 = __lsx_vreplvei_d(dst108, 1);
2174cabdff1aSopenharmony_ci    }
2175cabdff1aSopenharmony_ci}
2176cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter, 16 columns wide.
 * Forwards to hevc_hv_8t_8multx1mult_lsx(), which walks the block in
 * (width >> 3) strips of 8 columns.
 */
static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 16;

    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, width);
}
2185cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter, 24 columns wide.
 * Forwards to hevc_hv_8t_8multx1mult_lsx(), which walks the block in
 * (width >> 3) strips of 8 columns.
 */
static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 24;

    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, width);
}
2194cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter, 32 columns wide.
 * Forwards to hevc_hv_8t_8multx1mult_lsx(), which walks the block in
 * (width >> 3) strips of 8 columns.
 */
static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 32;

    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, width);
}
2203cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter, 48 columns wide.
 * Forwards to hevc_hv_8t_8multx1mult_lsx(), which walks the block in
 * (width >> 3) strips of 8 columns.
 */
static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 48;

    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, width);
}
2212cabdff1aSopenharmony_ci
/*
 * 8-tap horizontal + vertical filter, 64 columns wide.
 * Forwards to hevc_hv_8t_8multx1mult_lsx(), which walks the block in
 * (width >> 3) strips of 8 columns.
 */
static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    const int32_t width = 64;

    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, width);
}
2221cabdff1aSopenharmony_ci
2222cabdff1aSopenharmony_cistatic void hevc_hz_4t_32w_lsx(uint8_t *src,
2223cabdff1aSopenharmony_ci                               int32_t src_stride,
2224cabdff1aSopenharmony_ci                               int16_t *dst,
2225cabdff1aSopenharmony_ci                               int32_t dst_stride,
2226cabdff1aSopenharmony_ci                               const int8_t *filter,
2227cabdff1aSopenharmony_ci                               int32_t height)
2228cabdff1aSopenharmony_ci{
2229cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2230cabdff1aSopenharmony_ci    __m128i src0, src1, src2;
2231cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2232cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2233cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
2234cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
2235cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3;
2236cabdff1aSopenharmony_ci
2237cabdff1aSopenharmony_ci    src -= 1;
2238cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
2239cabdff1aSopenharmony_ci
2240cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
2241cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 10);
2242cabdff1aSopenharmony_ci
2243cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
2244cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
2245cabdff1aSopenharmony_ci        src2 = __lsx_vld(src, 24);
2246cabdff1aSopenharmony_ci        src += src_stride;
2247cabdff1aSopenharmony_ci
2248cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2,
2249cabdff1aSopenharmony_ci                  vec0, vec1);
2250cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src2, src2, mask0,
2251cabdff1aSopenharmony_ci                  vec2, vec3);
2252cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
2253cabdff1aSopenharmony_ci                  vec3, filt0, dst0, dst1, dst2, dst3);
2254cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3,
2255cabdff1aSopenharmony_ci                  vec0, vec1);
2256cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask1, src2, src2, mask1,
2257cabdff1aSopenharmony_ci                  vec2, vec3);
2258cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
2259cabdff1aSopenharmony_ci                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
2260cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 0);
2261cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
2262cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 32);
2263cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 48);
2264cabdff1aSopenharmony_ci        dst += dst_stride;
2265cabdff1aSopenharmony_ci    }
2266cabdff1aSopenharmony_ci}
2267cabdff1aSopenharmony_ci
2268cabdff1aSopenharmony_cistatic void hevc_vt_4t_16w_lsx(uint8_t *src,
2269cabdff1aSopenharmony_ci                               int32_t src_stride,
2270cabdff1aSopenharmony_ci                               int16_t *dst,
2271cabdff1aSopenharmony_ci                               int32_t dst_stride,
2272cabdff1aSopenharmony_ci                               const int8_t *filter,
2273cabdff1aSopenharmony_ci                               int32_t height)
2274cabdff1aSopenharmony_ci{
2275cabdff1aSopenharmony_ci    int32_t loop_cnt;
2276cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2277cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2278cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5;
2279cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src21_r, src43_r;
2280cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src21_l, src43_l;
2281cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst0_l, dst1_l;
2282cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2283cabdff1aSopenharmony_ci
2284cabdff1aSopenharmony_ci    src -= src_stride;
2285cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
2286cabdff1aSopenharmony_ci
2287cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
2288cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
2289cabdff1aSopenharmony_ci    src += src_stride_3x;
2290cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
2291cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
2292cabdff1aSopenharmony_ci
2293cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2294cabdff1aSopenharmony_ci        src3 = __lsx_vld(src, 0);
2295cabdff1aSopenharmony_ci        src4 = __lsx_vldx(src, src_stride);
2296cabdff1aSopenharmony_ci        src += src_stride_2x;
2297cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
2298cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
2299cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
2300cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2301cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
2302cabdff1aSopenharmony_ci                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
2303cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2304cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2305cabdff1aSopenharmony_ci        __lsx_vst(dst0_l, dst, 16);
2306cabdff1aSopenharmony_ci        dst += dst_stride;
2307cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
2308cabdff1aSopenharmony_ci        __lsx_vst(dst1_l, dst, 16);
2309cabdff1aSopenharmony_ci        dst += dst_stride;
2310cabdff1aSopenharmony_ci
2311cabdff1aSopenharmony_ci        src5 = __lsx_vld(src, 0);
2312cabdff1aSopenharmony_ci        src2 = __lsx_vldx(src, src_stride);
2313cabdff1aSopenharmony_ci        src += src_stride_2x;
2314cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
2315cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
2316cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
2317cabdff1aSopenharmony_ci                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2318cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
2319cabdff1aSopenharmony_ci                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
2320cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2321cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2322cabdff1aSopenharmony_ci        __lsx_vst(dst0_l, dst, 16);
2323cabdff1aSopenharmony_ci        dst += dst_stride;
2324cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
2325cabdff1aSopenharmony_ci        __lsx_vst(dst1_l, dst, 16);
2326cabdff1aSopenharmony_ci        dst += dst_stride;
2327cabdff1aSopenharmony_ci    }
2328cabdff1aSopenharmony_ci}
2329cabdff1aSopenharmony_ci
2330cabdff1aSopenharmony_cistatic void hevc_vt_4t_24w_lsx(uint8_t *src,
2331cabdff1aSopenharmony_ci                               int32_t src_stride,
2332cabdff1aSopenharmony_ci                               int16_t *dst,
2333cabdff1aSopenharmony_ci                               int32_t dst_stride,
2334cabdff1aSopenharmony_ci                               const int8_t *filter,
2335cabdff1aSopenharmony_ci                               int32_t height)
2336cabdff1aSopenharmony_ci{
2337cabdff1aSopenharmony_ci    int32_t loop_cnt;
2338cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2339cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2340cabdff1aSopenharmony_ci    uint8_t *_src;
2341cabdff1aSopenharmony_ci
2342cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5;
2343cabdff1aSopenharmony_ci    __m128i src6, src7, src8, src9, src10, src11;
2344cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src76_r, src98_r;
2345cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src87_r, src109_r;
2346cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
2347cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src21_l, src43_l;
2348cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l;
2349cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2350cabdff1aSopenharmony_ci
2351cabdff1aSopenharmony_ci    src -= src_stride;
2352cabdff1aSopenharmony_ci    _src = src + 16;
2353cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
2354cabdff1aSopenharmony_ci
2355cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
2356cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
2357cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
2358cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
2359cabdff1aSopenharmony_ci
2360cabdff1aSopenharmony_ci    src6 = __lsx_vld(_src, 0);
2361cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
2362cabdff1aSopenharmony_ci    src  += src_stride_3x;
2363cabdff1aSopenharmony_ci    _src += src_stride_3x;
2364cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
2365cabdff1aSopenharmony_ci
2366cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2367cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
2368cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
2369cabdff1aSopenharmony_ci        src  += src_stride_2x;
2370cabdff1aSopenharmony_ci        _src += src_stride_2x;
2371cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
2372cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
2373cabdff1aSopenharmony_ci
2374cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
2375cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
2376cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2377cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
2378cabdff1aSopenharmony_ci                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
2379cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2380cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
2381cabdff1aSopenharmony_ci                  dst2_r, dst3_r);
2382cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
2383cabdff1aSopenharmony_ci                  src109_r, filt1, dst2_r, dst3_r);
2384cabdff1aSopenharmony_ci
2385cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2386cabdff1aSopenharmony_ci        __lsx_vst(dst0_l, dst, 16);
2387cabdff1aSopenharmony_ci        __lsx_vst(dst2_r, dst, 32);
2388cabdff1aSopenharmony_ci        dst += dst_stride;
2389cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
2390cabdff1aSopenharmony_ci        __lsx_vst(dst1_l, dst, 16);
2391cabdff1aSopenharmony_ci        __lsx_vst(dst3_r, dst, 32);
2392cabdff1aSopenharmony_ci        dst += dst_stride;
2393cabdff1aSopenharmony_ci
2394cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
2395cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
2396cabdff1aSopenharmony_ci        src  += src_stride_2x;
2397cabdff1aSopenharmony_ci        _src += src_stride_2x;
2398cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
2399cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
2400cabdff1aSopenharmony_ci
2401cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
2402cabdff1aSopenharmony_ci
2403cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
2404cabdff1aSopenharmony_ci                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2405cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, src10_l,
2406cabdff1aSopenharmony_ci                  filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, filt1,
2407cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
2408cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
2409cabdff1aSopenharmony_ci                  dst2_r, dst3_r);
2410cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, src87_r,
2411cabdff1aSopenharmony_ci                  filt1, dst2_r, dst3_r);
2412cabdff1aSopenharmony_ci
2413cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2414cabdff1aSopenharmony_ci        __lsx_vst(dst0_l, dst, 16);
2415cabdff1aSopenharmony_ci        __lsx_vst(dst2_r, dst, 32);
2416cabdff1aSopenharmony_ci        dst += dst_stride;
2417cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
2418cabdff1aSopenharmony_ci        __lsx_vst(dst1_l, dst, 16);
2419cabdff1aSopenharmony_ci        __lsx_vst(dst3_r, dst, 32);
2420cabdff1aSopenharmony_ci        dst += dst_stride;
2421cabdff1aSopenharmony_ci    }
2422cabdff1aSopenharmony_ci}
2423cabdff1aSopenharmony_ci
2424cabdff1aSopenharmony_cistatic void hevc_vt_4t_32w_lsx(uint8_t *src,
2425cabdff1aSopenharmony_ci                               int32_t src_stride,
2426cabdff1aSopenharmony_ci                               int16_t *dst,
2427cabdff1aSopenharmony_ci                               int32_t dst_stride,
2428cabdff1aSopenharmony_ci                               const int8_t *filter,
2429cabdff1aSopenharmony_ci                               int32_t height)
2430cabdff1aSopenharmony_ci{
2431cabdff1aSopenharmony_ci    int32_t loop_cnt;
2432cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2433cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2434cabdff1aSopenharmony_ci    uint8_t *_src;
2435cabdff1aSopenharmony_ci
2436cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5;
2437cabdff1aSopenharmony_ci    __m128i src6, src7, src8, src9, src10, src11;
2438cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src76_r, src98_r;
2439cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src87_r, src109_r;
2440cabdff1aSopenharmony_ci    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
2441cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src76_l, src98_l;
2442cabdff1aSopenharmony_ci    __m128i src21_l, src43_l, src87_l, src109_l;
2443cabdff1aSopenharmony_ci    __m128i dst0_l, dst1_l, dst2_l, dst3_l;
2444cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2445cabdff1aSopenharmony_ci
2446cabdff1aSopenharmony_ci    src -= src_stride;
2447cabdff1aSopenharmony_ci    _src = src + 16;
2448cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
2449cabdff1aSopenharmony_ci
2450cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
2451cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
2452cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
2453cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
2454cabdff1aSopenharmony_ci
2455cabdff1aSopenharmony_ci    src6 = __lsx_vld(_src, 0);
2456cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
2457cabdff1aSopenharmony_ci    src  += src_stride_3x;
2458cabdff1aSopenharmony_ci    _src += src_stride_3x;
2459cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
2460cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
2461cabdff1aSopenharmony_ci
2462cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
2463cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
2464cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
2465cabdff1aSopenharmony_ci        src  += src_stride_2x;
2466cabdff1aSopenharmony_ci        _src += src_stride_2x;
2467cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
2468cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
2469cabdff1aSopenharmony_ci
2470cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
2471cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
2472cabdff1aSopenharmony_ci
2473cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
2474cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2475cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
2476cabdff1aSopenharmony_ci                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l,src43_l,
2477cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2478cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
2479cabdff1aSopenharmony_ci                  filt0, src87_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
2480cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst2_l, src98_l,
2481cabdff1aSopenharmony_ci                  filt1, dst3_r, src109_r, filt1, dst3_l, src109_l, filt1,
2482cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
2483cabdff1aSopenharmony_ci
2484cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2485cabdff1aSopenharmony_ci        __lsx_vst(dst0_l, dst, 16);
2486cabdff1aSopenharmony_ci        __lsx_vst(dst2_r, dst, 32);
2487cabdff1aSopenharmony_ci        __lsx_vst(dst2_l, dst, 48);
2488cabdff1aSopenharmony_ci        dst += dst_stride;
2489cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
2490cabdff1aSopenharmony_ci        __lsx_vst(dst1_l, dst, 16);
2491cabdff1aSopenharmony_ci        __lsx_vst(dst3_r, dst, 32);
2492cabdff1aSopenharmony_ci        __lsx_vst(dst3_l, dst, 48);
2493cabdff1aSopenharmony_ci        dst += dst_stride;
2494cabdff1aSopenharmony_ci
2495cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
2496cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
2497cabdff1aSopenharmony_ci        src  += src_stride_2x;
2498cabdff1aSopenharmony_ci        _src += src_stride_2x;
2499cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
2500cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
2501cabdff1aSopenharmony_ci
2502cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
2503cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src11, src10, src8, src11, src76_l, src87_l);
2504cabdff1aSopenharmony_ci
2505cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
2506cabdff1aSopenharmony_ci                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
2507cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
2508cabdff1aSopenharmony_ci                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
2509cabdff1aSopenharmony_ci                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
2510cabdff1aSopenharmony_ci
2511cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src98_l, filt0, src109_r,
2512cabdff1aSopenharmony_ci                  filt0, src109_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
2513cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst2_l, src76_l,
2514cabdff1aSopenharmony_ci                  filt1, dst3_r, src87_r, filt1, dst3_l, src87_l, filt1,
2515cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
2516cabdff1aSopenharmony_ci
2517cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2518cabdff1aSopenharmony_ci        __lsx_vst(dst0_l, dst, 16);
2519cabdff1aSopenharmony_ci        __lsx_vst(dst2_r, dst, 32);
2520cabdff1aSopenharmony_ci        __lsx_vst(dst2_l, dst, 48);
2521cabdff1aSopenharmony_ci        dst += dst_stride;
2522cabdff1aSopenharmony_ci        __lsx_vst(dst1_r, dst, 0);
2523cabdff1aSopenharmony_ci        __lsx_vst(dst1_l, dst, 16);
2524cabdff1aSopenharmony_ci        __lsx_vst(dst3_r, dst, 32);
2525cabdff1aSopenharmony_ci        __lsx_vst(dst3_l, dst, 48);
2526cabdff1aSopenharmony_ci        dst += dst_stride;
2527cabdff1aSopenharmony_ci    }
2528cabdff1aSopenharmony_ci}
2529cabdff1aSopenharmony_ci
2530cabdff1aSopenharmony_cistatic void hevc_hv_4t_8x2_lsx(uint8_t *src,
2531cabdff1aSopenharmony_ci                               int32_t src_stride,
2532cabdff1aSopenharmony_ci                               int16_t *dst,
2533cabdff1aSopenharmony_ci                               int32_t dst_stride,
2534cabdff1aSopenharmony_ci                               const int8_t *filter_x,
2535cabdff1aSopenharmony_ci                               const int8_t *filter_y)
2536cabdff1aSopenharmony_ci{
2537cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2538cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
2539cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2540cabdff1aSopenharmony_ci
2541cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4;
2542cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2543cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1;
2544cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2545cabdff1aSopenharmony_ci    __m128i mask1;
2546cabdff1aSopenharmony_ci    __m128i filter_vec;
2547cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2548cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4;
2549cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
2550cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
2551cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
2552cabdff1aSopenharmony_ci
2553cabdff1aSopenharmony_ci    src -= (src_stride + 1);
2554cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2555cabdff1aSopenharmony_ci
2556cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
2557cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2558cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2559cabdff1aSopenharmony_ci
2560cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
2561cabdff1aSopenharmony_ci
2562cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
2563cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
2564cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
2565cabdff1aSopenharmony_ci    src4 = __lsx_vldx(src, src_stride_4x);
2566cabdff1aSopenharmony_ci
2567cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
2568cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
2569cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
2570cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
2571cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
2572cabdff1aSopenharmony_ci
2573cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b,  vec0, filt0, vec2, filt0, vec4, filt0,
2574cabdff1aSopenharmony_ci              vec6, filt0, dst0, dst1, dst2, dst3);
2575cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2576cabdff1aSopenharmony_ci              dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
2577cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
2578cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
2579cabdff1aSopenharmony_ci
2580cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2581cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2582cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2583cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2584cabdff1aSopenharmony_ci
2585cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2586cabdff1aSopenharmony_ci              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2587cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2588cabdff1aSopenharmony_ci              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
2589cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
2590cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2591cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
2592cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2593cabdff1aSopenharmony_ci    __lsx_vst(dst0_r, dst, 0);
2594cabdff1aSopenharmony_ci    __lsx_vst(dst1_r, dst + dst_stride, 0);
2595cabdff1aSopenharmony_ci}
2596cabdff1aSopenharmony_ci
2597cabdff1aSopenharmony_cistatic void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride,
2598cabdff1aSopenharmony_ci                                   int16_t *dst, int32_t dst_stride,
2599cabdff1aSopenharmony_ci                                   const int8_t *filter_x,
2600cabdff1aSopenharmony_ci                                   const int8_t *filter_y, int32_t width8mult)
2601cabdff1aSopenharmony_ci{
2602cabdff1aSopenharmony_ci    int32_t cnt;
2603cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2604cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
2605cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
2606cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
2607cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2608cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
2609cabdff1aSopenharmony_ci
2610cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
2611cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2612cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
2613cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
2614cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
2615cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
2616cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2617cabdff1aSopenharmony_ci
2618cabdff1aSopenharmony_ci    src -= (src_stride + 1);
2619cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2620cabdff1aSopenharmony_ci
2621cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
2622cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2623cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2624cabdff1aSopenharmony_ci
2625cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2626cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
2627cabdff1aSopenharmony_ci
2628cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
2629cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
2630cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
2631cabdff1aSopenharmony_ci                  src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
2632cabdff1aSopenharmony_ci        src += src_stride_4x;
2633cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
2634cabdff1aSopenharmony_ci        src += (8 - src_stride_4x);
2635cabdff1aSopenharmony_ci
2636cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
2637cabdff1aSopenharmony_ci                  vec0, vec1);
2638cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
2639cabdff1aSopenharmony_ci                  vec2, vec3);
2640cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
2641cabdff1aSopenharmony_ci                  vec4, vec5);
2642cabdff1aSopenharmony_ci
2643cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2644cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2645cabdff1aSopenharmony_ci                  dst0, dst1);
2646cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2647cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2648cabdff1aSopenharmony_ci
2649cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2650cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2651cabdff1aSopenharmony_ci
2652cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
2653cabdff1aSopenharmony_ci                  vec0, vec1);
2654cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
2655cabdff1aSopenharmony_ci                  vec2, vec3);
2656cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
2657cabdff1aSopenharmony_ci                  vec4, vec5);
2658cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
2659cabdff1aSopenharmony_ci                  vec6, vec7);
2660cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2661cabdff1aSopenharmony_ci                  vec6, filt0, dst3, dst4, dst5, dst6);
2662cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
2663cabdff1aSopenharmony_ci                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
2664cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2665cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2666cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
2667cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
2668cabdff1aSopenharmony_ci
2669cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2670cabdff1aSopenharmony_ci                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2671cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2672cabdff1aSopenharmony_ci                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
2673cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
2674cabdff1aSopenharmony_ci
2675cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2676cabdff1aSopenharmony_ci                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2677cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
2678cabdff1aSopenharmony_ci                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
2679cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
2680cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2681cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
2682cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2683cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
2684cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
2685cabdff1aSopenharmony_ci                  dst0_r, dst1_r);
2686cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vpickev_h, dst2_l, dst2_r, dst3_l, dst3_r,
2687cabdff1aSopenharmony_ci                  dst2_r, dst3_r);
2688cabdff1aSopenharmony_ci
2689cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst, 0);
2690cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst, dst_stride_x);
2691cabdff1aSopenharmony_ci        __lsx_vstx(dst2_r, dst, dst_stride_2x);
2692cabdff1aSopenharmony_ci        __lsx_vstx(dst3_r, dst, dst_stride_3x);
2693cabdff1aSopenharmony_ci        dst += 8;
2694cabdff1aSopenharmony_ci    }
2695cabdff1aSopenharmony_ci}
2696cabdff1aSopenharmony_ci
/*
 * Separable 4-tap filter (horizontal pass followed by vertical pass) that
 * writes an 8-column by 6-row block of 16-bit results.
 *
 * src        8-bit input samples; reading starts one row above and one byte
 *            to the left of the pointer passed in
 * src_stride distance between input rows, in bytes
 * dst        16-bit output samples
 * dst_stride distance between output rows, in int16_t elements
 * filter_x   four signed 8-bit taps for the horizontal pass
 * filter_y   signed 8-bit taps for the vertical pass (the first four are used)
 *
 * NOTE(review): filter_y is fetched with a full-vector __lsx_vld although
 * only its first four bytes are consumed -- confirm the caller's table stays
 * readable for the whole load.
 */
2697cabdff1aSopenharmony_cistatic void hevc_hv_4t_8x6_lsx(uint8_t *src,
2698cabdff1aSopenharmony_ci                               int32_t src_stride,
2699cabdff1aSopenharmony_ci                               int16_t *dst,
2700cabdff1aSopenharmony_ci                               int32_t dst_stride,
2701cabdff1aSopenharmony_ci                               const int8_t *filter_x,
2702cabdff1aSopenharmony_ci                               const int8_t *filter_y)
2703cabdff1aSopenharmony_ci{
2704cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
    /* dst_stride is in int16_t units, so this is the byte size of one row. */
2705cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
2706cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
2707cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2708cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
2709cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2710cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1;
2711cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2712cabdff1aSopenharmony_ci    __m128i mask1, filter_vec;
2713cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2714cabdff1aSopenharmony_ci    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
2715cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2716cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2717cabdff1aSopenharmony_ci    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
2718cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
2719cabdff1aSopenharmony_ci    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
2720cabdff1aSopenharmony_ci    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
2721cabdff1aSopenharmony_ci    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
2722cabdff1aSopenharmony_ci
    /* Start reading one row above and one byte to the left of src. */
2723cabdff1aSopenharmony_ci    src -= (src_stride + 1);
    /* filt0 = taps 0-1, filt1 = taps 2-3 of filter_x, each pair replicated. */
2724cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2725cabdff1aSopenharmony_ci
    /* Widen filter_y to 16 bit; filt_h0 = taps 0-1, filt_h1 = taps 2-3. */
2726cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
2727cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2728cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2729cabdff1aSopenharmony_ci
    /* mask0 selects byte pairs (0,1),(1,2),...; mask1 the pairs two bytes on. */
2730cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
2731cabdff1aSopenharmony_ci
    /* Nine input rows: src0..src8. */
2732cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
2733cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
2734cabdff1aSopenharmony_ci              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
2735cabdff1aSopenharmony_ci    src += src_stride_4x;
2736cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
2737cabdff1aSopenharmony_ci              src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
2738cabdff1aSopenharmony_ci
    /* Per row: even-numbered vec uses mask0 (taps 0-1), odd uses mask1 (taps 2-3). */
2739cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
2740cabdff1aSopenharmony_ci              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
2741cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,src3, src3,
2742cabdff1aSopenharmony_ci              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
2743cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
2744cabdff1aSopenharmony_ci              mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
2745cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
2746cabdff1aSopenharmony_ci              mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
2747cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1,
2748cabdff1aSopenharmony_ci              vec16, vec17);
2749cabdff1aSopenharmony_ci
    /* Horizontal pass: dst0..dst8 hold one filtered row each in 16-bit lanes. */
2750cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
2751cabdff1aSopenharmony_ci              filt0, dst0, dst1, dst2, dst3);
2752cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2753cabdff1aSopenharmony_ci              dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
2754cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b,  vec8, filt0, vec10, filt0, vec12, filt0,
2755cabdff1aSopenharmony_ci              vec14, filt0, dst4, dst5, dst6, dst7);
2756cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
2757cabdff1aSopenharmony_ci              vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
2758cabdff1aSopenharmony_ci    dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
2759cabdff1aSopenharmony_ci    dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
2760cabdff1aSopenharmony_ci
    /* Interleave each row with the next one (_r: low half, _l: high half). */
2761cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
2762cabdff1aSopenharmony_ci              dst10_r, dst21_r, dst32_r, dst43_r);
2763cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
2764cabdff1aSopenharmony_ci              dst10_l, dst21_l, dst32_l, dst43_l);
2765cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
2766cabdff1aSopenharmony_ci              dst54_r, dst65_r, dst76_r, dst87_r);
2767cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
2768cabdff1aSopenharmony_ci              dst54_l, dst65_l, dst76_l, dst87_l);
2769cabdff1aSopenharmony_ci
    /* Vertical pass in 32-bit lanes: taps 0-1 first, then taps 2-3 accumulated. */
2770cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2771cabdff1aSopenharmony_ci              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2772cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2773cabdff1aSopenharmony_ci              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2774cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
2775cabdff1aSopenharmony_ci              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
2776cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
2777cabdff1aSopenharmony_ci              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
2778cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
2779cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
2780cabdff1aSopenharmony_ci              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
2781cabdff1aSopenharmony_ci              dst2_r, dst2_l, dst3_r, dst3_l);
2782cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
2783cabdff1aSopenharmony_ci              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
2784cabdff1aSopenharmony_ci              dst4_r, dst4_l, dst5_r, dst5_l);
    /* Arithmetic right shift of every 32-bit sum by 6. */
2785cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, dst0_r,
2786cabdff1aSopenharmony_ci              dst0_l, dst1_r, dst1_l);
2787cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, dst2_r,
2788cabdff1aSopenharmony_ci              dst2_l, dst3_r, dst3_l);
2789cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6, dst4_r,
2790cabdff1aSopenharmony_ci              dst4_l, dst5_r, dst5_l);
2791cabdff1aSopenharmony_ci
    /* Keep the even halfword of each 32-bit lane; _r and _l merge into one row. */
2792cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpickev_h,dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
2793cabdff1aSopenharmony_ci              dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
2794cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
2795cabdff1aSopenharmony_ci
    /*
     * dst_stride_2x is used two ways: as the offset argument of __lsx_vstx
     * it addresses the next row, while added to the int16_t pointer it
     * advances two rows.
     */
2796cabdff1aSopenharmony_ci    __lsx_vst(dst0_r, dst, 0);
2797cabdff1aSopenharmony_ci    __lsx_vstx(dst1_r, dst, dst_stride_2x);
2798cabdff1aSopenharmony_ci    dst += dst_stride_2x;
2799cabdff1aSopenharmony_ci    __lsx_vst(dst2_r, dst, 0);
2800cabdff1aSopenharmony_ci    __lsx_vstx(dst3_r, dst, dst_stride_2x);
2801cabdff1aSopenharmony_ci    dst += dst_stride_2x;
2802cabdff1aSopenharmony_ci    __lsx_vst(dst4_r, dst, 0);
2803cabdff1aSopenharmony_ci    __lsx_vstx(dst5_r, dst, dst_stride_2x);
2804cabdff1aSopenharmony_ci}
2805cabdff1aSopenharmony_ci
/*
 * Separable 4-tap filter (horizontal pass, then vertical pass) for blocks
 * built from 8-column strips, producing 16-bit results four rows at a time.
 *
 * src, src_stride  8-bit input and its row pitch in bytes; reading starts one
 *                  row above and one byte to the left of src
 * dst, dst_stride  16-bit output and its row pitch in int16_t elements
 * filter_x         four signed 8-bit horizontal taps
 * filter_y         signed 8-bit vertical taps (the first four are used)
 * height           row count; height >> 2 groups of four rows are written,
 *                  so a remainder of 1-3 rows is not produced
 * width8mult       number of 8-column strips to process
 *
 * NOTE(review): filter_y is fetched with a full-vector __lsx_vld although
 * only its first four bytes are consumed -- confirm the caller's table stays
 * readable for the whole load.
 */
2806cabdff1aSopenharmony_cistatic void hevc_hv_4t_8multx4mult_lsx(uint8_t *src,
2807cabdff1aSopenharmony_ci                                       int32_t src_stride,
2808cabdff1aSopenharmony_ci                                       int16_t *dst,
2809cabdff1aSopenharmony_ci                                       int32_t dst_stride,
2810cabdff1aSopenharmony_ci                                       const int8_t *filter_x,
2811cabdff1aSopenharmony_ci                                       const int8_t *filter_y,
2812cabdff1aSopenharmony_ci                                       int32_t height,
2813cabdff1aSopenharmony_ci                                       int32_t width8mult)
2814cabdff1aSopenharmony_ci{
2815cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
2816cabdff1aSopenharmony_ci    uint8_t *src_tmp;
2817cabdff1aSopenharmony_ci    int16_t *dst_tmp;
    /*
     * dst_stride counts int16_t elements, so dst_stride_x, dst_stride_2x and
     * dst_stride_3x are the byte sizes of one, two and three output rows.
     */
2818cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2819cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
2820cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
2821cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
2822cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2823cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
2824cabdff1aSopenharmony_ci
2825cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6;
2826cabdff1aSopenharmony_ci    __m128i filt0, filt1;
2827cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1;
2828cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2829cabdff1aSopenharmony_ci    __m128i mask1, filter_vec;
2830cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2831cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
2832cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2833cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
2834cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
2835cabdff1aSopenharmony_ci
    /* Start reading one row above and one byte to the left of src. */
2836cabdff1aSopenharmony_ci    src -= (src_stride + 1);
    /* filt0 = taps 0-1, filt1 = taps 2-3 of filter_x, each pair replicated. */
2837cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2838cabdff1aSopenharmony_ci
    /* Widen filter_y to 16 bit; filt_h0 = taps 0-1, filt_h1 = taps 2-3. */
2839cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
2840cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2841cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2842cabdff1aSopenharmony_ci
    /* mask0 selects byte pairs (0,1),(1,2),...; mask1 the pairs two bytes on. */
2843cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
2844cabdff1aSopenharmony_ci
    /* One iteration per 8-column strip. */
2845cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
2846cabdff1aSopenharmony_ci        src_tmp = src;
2847cabdff1aSopenharmony_ci        dst_tmp = dst;
2848cabdff1aSopenharmony_ci
        /* Prime the vertical filter with the first three rows of the strip. */
2849cabdff1aSopenharmony_ci        src0 = __lsx_vld(src_tmp, 0);
2850cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
2851cabdff1aSopenharmony_ci                  src1, src2);
2852cabdff1aSopenharmony_ci        src_tmp += src_stride_3x;
2853cabdff1aSopenharmony_ci
2854cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
2855cabdff1aSopenharmony_ci                  vec0, vec1);
2856cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
2857cabdff1aSopenharmony_ci                  vec2, vec3);
2858cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
2859cabdff1aSopenharmony_ci                  vec4, vec5);
2860cabdff1aSopenharmony_ci
        /* Horizontal pass for rows 0-2 (16-bit lanes). */
2861cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
2862cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
2863cabdff1aSopenharmony_ci                  dst0, dst1);
2864cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
2865cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
2866cabdff1aSopenharmony_ci
        /* Row pairs (0,1) and (1,2), interleaved; _r: low half, _l: high half. */
2867cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2868cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
2869cabdff1aSopenharmony_ci
        /* Each pass reads four new rows and writes four output rows. */
2870cabdff1aSopenharmony_ci        for (loop_cnt = height >> 2; loop_cnt--;) {
2871cabdff1aSopenharmony_ci            src3 = __lsx_vld(src_tmp, 0);
2872cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
2873cabdff1aSopenharmony_ci                      src4, src5);
2874cabdff1aSopenharmony_ci            src6 = __lsx_vldx(src_tmp, src_stride_3x);
2875cabdff1aSopenharmony_ci            src_tmp += src_stride_4x;
2876cabdff1aSopenharmony_ci
2877cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
2878cabdff1aSopenharmony_ci                      vec0, vec1);
2879cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
2880cabdff1aSopenharmony_ci                      vec2, vec3);
2881cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
2882cabdff1aSopenharmony_ci                      vec4, vec5);
2883cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
2884cabdff1aSopenharmony_ci                      vec6, vec7);
2885cabdff1aSopenharmony_ci
            /* Horizontal pass for the four new rows. */
2886cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
2887cabdff1aSopenharmony_ci                      vec6, filt0, dst3, dst4, dst5, dst6);
2888cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
2889cabdff1aSopenharmony_ci                      filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
2890cabdff1aSopenharmony_ci                      dst4, dst5, dst6);
2891cabdff1aSopenharmony_ci
2892cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
2893cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
2894cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
2895cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
2896cabdff1aSopenharmony_ci
            /* Vertical pass in 32-bit lanes: taps 0-1, then taps 2-3 added. */
2897cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
2898cabdff1aSopenharmony_ci                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
2899cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
2900cabdff1aSopenharmony_ci                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
2901cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
2902cabdff1aSopenharmony_ci                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
2903cabdff1aSopenharmony_ci                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
2904cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
2905cabdff1aSopenharmony_ci                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
2906cabdff1aSopenharmony_ci                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
2907cabdff1aSopenharmony_ci
            /* Arithmetic right shift of every 32-bit sum by 6. */
2908cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2909cabdff1aSopenharmony_ci                      dst0_r, dst0_l, dst1_r, dst1_l);
2910cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2911cabdff1aSopenharmony_ci                      dst2_r, dst2_l, dst3_r, dst3_l);
2912cabdff1aSopenharmony_ci
            /* Keep the even halfword of each 32-bit lane; one vector per row. */
2913cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
2914cabdff1aSopenharmony_ci                      dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
2915cabdff1aSopenharmony_ci
2916cabdff1aSopenharmony_ci            __lsx_vst(dst0_r, dst_tmp, 0);
2917cabdff1aSopenharmony_ci            __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
2918cabdff1aSopenharmony_ci            __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
2919cabdff1aSopenharmony_ci            __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
            /* Added to an int16_t pointer, dst_stride_2x steps four rows. */
2920cabdff1aSopenharmony_ci            dst_tmp += dst_stride_2x;
2921cabdff1aSopenharmony_ci
            /* The last three filtered rows become the history of the next pass. */
2922cabdff1aSopenharmony_ci            dst10_r = dst54_r;
2923cabdff1aSopenharmony_ci            dst10_l = dst54_l;
2924cabdff1aSopenharmony_ci            dst21_r = dst65_r;
2925cabdff1aSopenharmony_ci            dst21_l = dst65_l;
2926cabdff1aSopenharmony_ci            dst2 = dst6;
2927cabdff1aSopenharmony_ci        }
        /* Next strip: eight samples to the right in both buffers. */
2928cabdff1aSopenharmony_ci        src += 8;
2929cabdff1aSopenharmony_ci        dst += 8;
2930cabdff1aSopenharmony_ci    }
2931cabdff1aSopenharmony_ci}
2932cabdff1aSopenharmony_ci
/*
 * Height dispatcher for the 8-column separable 4-tap filter.
 *
 * Heights 2 and 6 have dedicated routines, height 4 uses the single-pass
 * 8multx4 routine with one strip, and any other multiple of four goes to the
 * generic 8multx4mult routine with one strip.  For a height that matches
 * none of these tests no routine is called and dst is left untouched.
 */
2933cabdff1aSopenharmony_cistatic void hevc_hv_4t_8w_lsx(uint8_t *src,
2934cabdff1aSopenharmony_ci                              int32_t src_stride,
2935cabdff1aSopenharmony_ci                              int16_t *dst,
2936cabdff1aSopenharmony_ci                              int32_t dst_stride,
2937cabdff1aSopenharmony_ci                              const int8_t *filter_x,
2938cabdff1aSopenharmony_ci                              const int8_t *filter_y,
2939cabdff1aSopenharmony_ci                              int32_t height)
2940cabdff1aSopenharmony_ci{
2941cabdff1aSopenharmony_ci
2942cabdff1aSopenharmony_ci    if (2 == height) {
2943cabdff1aSopenharmony_ci        hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride,
2944cabdff1aSopenharmony_ci                           filter_x, filter_y);
2945cabdff1aSopenharmony_ci    } else if (4 == height) {
        /* Last argument: number of 8-column strips. */
2946cabdff1aSopenharmony_ci        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
2947cabdff1aSopenharmony_ci                               filter_x, filter_y, 1);
2948cabdff1aSopenharmony_ci    } else if (6 == height) {
2949cabdff1aSopenharmony_ci        hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride,
2950cabdff1aSopenharmony_ci                           filter_x, filter_y);
2951cabdff1aSopenharmony_ci    } else if (0 == (height & 0x03)) {
        /* Remaining multiples of four (height 4 was handled above). */
2952cabdff1aSopenharmony_ci        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
2953cabdff1aSopenharmony_ci                                   filter_x, filter_y, height, 1);
2954cabdff1aSopenharmony_ci    }
2955cabdff1aSopenharmony_ci}
2956cabdff1aSopenharmony_ci
/*
 * Separable 4-tap filter (horizontal pass, then vertical pass) for a
 * 12-column block, done as an 8-column part followed by a 4-column part.
 *
 * src, src_stride  8-bit input and its row pitch in bytes; reading starts one
 *                  row above and one byte to the left of src
 * dst, dst_stride  16-bit output and its row pitch in int16_t elements
 * filter_x         four signed 8-bit horizontal taps
 * filter_y         signed 8-bit vertical taps (the first four are used)
 * height           not read by this function: the loop counts are fixed at
 *                  4 passes of 4 rows for the 8-column part and 2 passes of
 *                  8 rows for the 4-column part, i.e. 16 output rows
 *
 * NOTE(review): callers presumably only use this routine with height 16 --
 * confirm, since any other value is silently ignored.
 */
2957cabdff1aSopenharmony_cistatic void hevc_hv_4t_12w_lsx(uint8_t *src,
2958cabdff1aSopenharmony_ci                               int32_t src_stride,
2959cabdff1aSopenharmony_ci                               int16_t *dst,
2960cabdff1aSopenharmony_ci                               int32_t dst_stride,
2961cabdff1aSopenharmony_ci                               const int8_t *filter_x,
2962cabdff1aSopenharmony_ci                               const int8_t *filter_y,
2963cabdff1aSopenharmony_ci                               int32_t height)
2964cabdff1aSopenharmony_ci{
2965cabdff1aSopenharmony_ci    uint32_t loop_cnt;
2966cabdff1aSopenharmony_ci    uint8_t *src_tmp;
2967cabdff1aSopenharmony_ci    int16_t *dst_tmp;
    /*
     * dst_stride counts int16_t elements, so dst_stride_x, dst_stride_2x and
     * dst_stride_3x are the byte sizes of one, two and three output rows.
     */
2968cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
2969cabdff1aSopenharmony_ci    int32_t dst_stride_x  = (dst_stride << 1);
2970cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
2971cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 2);
2972cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
2973cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
2974cabdff1aSopenharmony_ci
2975cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2976cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2977cabdff1aSopenharmony_ci    __m128i mask0, mask1, mask2, mask3;
2978cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, dst0;
2979cabdff1aSopenharmony_ci    __m128i dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
2980cabdff1aSopenharmony_ci    __m128i dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
2981cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
2982cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
2983cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
2984cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2985cabdff1aSopenharmony_ci
    /* Start reading one row above and one byte to the left of src. */
2986cabdff1aSopenharmony_ci    src -= (src_stride + 1);
    /* filt0 = taps 0-1, filt1 = taps 2-3 of filter_x, each pair replicated. */
2987cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
2988cabdff1aSopenharmony_ci
    /* Widen filter_y to 16 bit; filt_h0 = taps 0-1, filt_h1 = taps 2-3. */
2989cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
2990cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
2991cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
2992cabdff1aSopenharmony_ci
    /* "8 width" mask: byte pairs (0,1),(1,2),...; mask1 is two bytes further on. */
2993cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
2994cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
2995cabdff1aSopenharmony_ci
    /* ---- Part 1: left 8 columns, worked on through src_tmp/dst_tmp ---- */
2996cabdff1aSopenharmony_ci    src_tmp = src;
2997cabdff1aSopenharmony_ci    dst_tmp = dst;
2998cabdff1aSopenharmony_ci
    /* Prime the vertical filter with the first three rows. */
2999cabdff1aSopenharmony_ci    src0 = __lsx_vld(src_tmp, 0);
3000cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
3001cabdff1aSopenharmony_ci              src1, src2);
3002cabdff1aSopenharmony_ci    src_tmp += src_stride_3x;
3003cabdff1aSopenharmony_ci
3004cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
3005cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
3006cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
3007cabdff1aSopenharmony_ci
    /* Horizontal pass for rows 0-2 (16-bit lanes). */
3008cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
3009cabdff1aSopenharmony_ci    dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
3010cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
3011cabdff1aSopenharmony_ci              dst0, dst1);
3012cabdff1aSopenharmony_ci    dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
3013cabdff1aSopenharmony_ci
    /* Row pairs (0,1) and (1,2), interleaved; _r: low half, _l: high half. */
3014cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3015cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
3016cabdff1aSopenharmony_ci
    /* Fixed four passes; each reads four new rows and writes four output rows. */
3017cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
3018cabdff1aSopenharmony_ci        src3 = __lsx_vld(src_tmp, 0);
3019cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
3020cabdff1aSopenharmony_ci                  src4, src5);
3021cabdff1aSopenharmony_ci        src6 = __lsx_vldx(src_tmp, src_stride_3x);
3022cabdff1aSopenharmony_ci        src_tmp += src_stride_4x;
3023cabdff1aSopenharmony_ci
3024cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
3025cabdff1aSopenharmony_ci                  vec0, vec1);
3026cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
3027cabdff1aSopenharmony_ci                  vec2, vec3);
3028cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
3029cabdff1aSopenharmony_ci                  vec4, vec5);
3030cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
3031cabdff1aSopenharmony_ci                  vec6, vec7);
3032cabdff1aSopenharmony_ci
        /* Horizontal pass for the four new rows. */
3033cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
3034cabdff1aSopenharmony_ci                  vec6, filt0, dst3, dst4, dst5, dst6);
3035cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
3036cabdff1aSopenharmony_ci                  filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
3037cabdff1aSopenharmony_ci                  dst4, dst5, dst6);
3038cabdff1aSopenharmony_ci
3039cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
3040cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
3041cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
3042cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
3043cabdff1aSopenharmony_ci
        /* Vertical pass in 32-bit lanes: taps 0-1, then taps 2-3 added. */
3044cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
3045cabdff1aSopenharmony_ci                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
3046cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
3047cabdff1aSopenharmony_ci                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
3048cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
3049cabdff1aSopenharmony_ci                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
3050cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
3051cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
3052cabdff1aSopenharmony_ci                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
3053cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
        /* Shift each sum right by 6, then keep the even halfword of each lane. */
3054cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
3055cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
3056cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
3057cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
3058cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
3059cabdff1aSopenharmony_ci                  dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
3060cabdff1aSopenharmony_ci        __lsx_vst(dst0_r, dst_tmp, 0);
3061cabdff1aSopenharmony_ci        __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
3062cabdff1aSopenharmony_ci        __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
3063cabdff1aSopenharmony_ci        __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
        /* Added to an int16_t pointer, dst_stride_2x steps four rows. */
3064cabdff1aSopenharmony_ci        dst_tmp += dst_stride_2x;
3065cabdff1aSopenharmony_ci
        /* The last three filtered rows become the history of the next pass. */
3066cabdff1aSopenharmony_ci        dst10_r = dst54_r;
3067cabdff1aSopenharmony_ci        dst10_l = dst54_l;
3068cabdff1aSopenharmony_ci        dst21_r = dst65_r;
3069cabdff1aSopenharmony_ci        dst21_l = dst65_l;
3070cabdff1aSopenharmony_ci        dst2 = dst6;
3071cabdff1aSopenharmony_ci    }
3072cabdff1aSopenharmony_ci
    /* ---- Part 2: remaining 4 columns, eight samples to the right ---- */
3073cabdff1aSopenharmony_ci    src += 8;
3074cabdff1aSopenharmony_ci    dst += 8;
3075cabdff1aSopenharmony_ci
    /*
     * "4 width" half of the mask table: its indices address both shuffle
     * operands, so one vector carries the byte pairs of two rows.
     */
3076cabdff1aSopenharmony_ci    mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
3077cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask2, 2);
3078cabdff1aSopenharmony_ci
    /* History rows: dst10 is built from rows 0 and 1, dst21 from rows 1 and 2. */
3079cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
3080cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
3081cabdff1aSopenharmony_ci    src += src_stride_3x;
3082cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
3083cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
3084cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
3085cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
3086cabdff1aSopenharmony_ci              dst10, dst21);
3087cabdff1aSopenharmony_ci    dst10_r = __lsx_vilvl_h(dst21, dst10);
3088cabdff1aSopenharmony_ci    dst21_r = __lsx_vilvh_h(dst21, dst10);
    /* dst22: the upper 64-bit element of dst21 replicated into both halves. */
3089cabdff1aSopenharmony_ci    dst22 = __lsx_vreplvei_d(dst21, 1);
3090cabdff1aSopenharmony_ci
    /* Fixed two passes; each reads eight new rows and writes eight output rows. */
3091cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
3092cabdff1aSopenharmony_ci        src3 = __lsx_vld(src, 0);
3093cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
3094cabdff1aSopenharmony_ci        src6 = __lsx_vldx(src, src_stride_3x);
3095cabdff1aSopenharmony_ci        src += src_stride_4x;
3096cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
3097cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
3098cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
3099cabdff1aSopenharmony_ci        src += src_stride_4x;
        /* Rows k and k+4 are combined in one vector (src3 with src7, and so on). */
3100cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3,
3101cabdff1aSopenharmony_ci                  vec0, vec1);
3102cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src8, src4, mask2, src8, src4, mask3,
3103cabdff1aSopenharmony_ci                  vec2, vec3);
3104cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3,
3105cabdff1aSopenharmony_ci                  vec4, vec5);
3106cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src10, src6, mask2, src10, src6, mask3,
3107cabdff1aSopenharmony_ci                  vec6, vec7);
3108cabdff1aSopenharmony_ci
        /* Horizontal pass: dst73/dst84/dst95/dst106 each hold two filtered rows. */
3109cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
3110cabdff1aSopenharmony_ci                  vec6, filt0, dst73, dst84, dst95, dst106);
3111cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
3112cabdff1aSopenharmony_ci                  filt1, dst95, vec5, filt1, dst106, vec7, filt1, dst73,
3113cabdff1aSopenharmony_ci                  dst84, dst95, dst106);
3114cabdff1aSopenharmony_ci
        /* Build the interleaved row pairs; dst22 supplies the carried-over row. */
3115cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst73, dst22, dst84, dst73, dst32_r, dst43_r);
3116cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
3117cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst95, dst84, dst106, dst95, dst54_r, dst65_r);
3118cabdff1aSopenharmony_ci        dst109_r = __lsx_vilvh_h(dst106, dst95);
3119cabdff1aSopenharmony_ci        dst22 = __lsx_vreplvei_d(dst73, 1);
3120cabdff1aSopenharmony_ci        dst76_r = __lsx_vilvl_h(dst22, dst106);
3121cabdff1aSopenharmony_ci
        /* Vertical pass in 32-bit lanes: taps 0-1, then taps 2-3 added. */
3122cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
3123cabdff1aSopenharmony_ci                  filt_h0, dst43_r, filt_h0, tmp0, tmp1, tmp2, tmp3);
3124cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
3125cabdff1aSopenharmony_ci                  filt_h0, dst87_r, filt_h0, tmp4, tmp5, tmp6, tmp7);
3126cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, tmp0, dst32_r, filt_h1, tmp1, dst43_r,
3127cabdff1aSopenharmony_ci                  filt_h1, tmp2, dst54_r, filt_h1, tmp3, dst65_r, filt_h1,
3128cabdff1aSopenharmony_ci                  tmp0, tmp1, tmp2, tmp3);
3129cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, tmp4, dst76_r, filt_h1, tmp5, dst87_r,
3130cabdff1aSopenharmony_ci                  filt_h1, tmp6, dst98_r, filt_h1, tmp7, dst109_r, filt_h1,
3131cabdff1aSopenharmony_ci                  tmp4, tmp5, tmp6, tmp7);
        /* Shift each sum right by 6, then pack two rows into each of tmp0..tmp3. */
3132cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, tmp0, 6, tmp1, 6, tmp2, 6, tmp3, 6,
3133cabdff1aSopenharmony_ci                  tmp0, tmp1, tmp2, tmp3);
3134cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsrai_w, tmp4, 6, tmp5, 6, tmp6, 6, tmp7, 6,
3135cabdff1aSopenharmony_ci                  tmp4, tmp5, tmp6, tmp7);
3136cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4,
3137cabdff1aSopenharmony_ci                  tmp7, tmp6, tmp0, tmp1, tmp2, tmp3);
3138cabdff1aSopenharmony_ci
        /*
         * Each 64-bit element of tmp0..tmp3 is stored as one output row;
         * dst advances by dst_stride elements after every store.
         */
3139cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp0, dst, 0, 0);
3140cabdff1aSopenharmony_ci        dst += dst_stride;
3141cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp0, dst, 0, 1);
3142cabdff1aSopenharmony_ci        dst += dst_stride;
3143cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp1, dst, 0, 0);
3144cabdff1aSopenharmony_ci        dst += dst_stride;
3145cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp1, dst, 0, 1);
3146cabdff1aSopenharmony_ci        dst += dst_stride;
3147cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp2, dst, 0, 0);
3148cabdff1aSopenharmony_ci        dst += dst_stride;
3149cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp2, dst, 0, 1);
3150cabdff1aSopenharmony_ci        dst += dst_stride;
3151cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp3, dst, 0, 0);
3152cabdff1aSopenharmony_ci        dst += dst_stride;
3153cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp3, dst, 0, 1);
3154cabdff1aSopenharmony_ci        dst += dst_stride;
3155cabdff1aSopenharmony_ci
        /* Carry the newest row pairs over as history for the next pass. */
3156cabdff1aSopenharmony_ci        dst10_r = dst98_r;
3157cabdff1aSopenharmony_ci        dst21_r = dst109_r;
3158cabdff1aSopenharmony_ci        dst22 = __lsx_vreplvei_d(dst106, 1);
3159cabdff1aSopenharmony_ci    }
3160cabdff1aSopenharmony_ci}
3161cabdff1aSopenharmony_ci
/*
 * 16-wide variant of the "hv 4t" filter (name suggests horizontal+vertical,
 * 4-tap -- not verifiable from this function alone).
 *
 * Forwards every argument unchanged to one of two helpers and appends the
 * constant 2 as the final argument (presumably the number of 8-pixel column
 * groups, i.e. 16 / 8 -- TODO confirm against the helper definitions):
 *   - height == 4 : hevc_hv_4t_8multx4_lsx (takes no height argument)
 *   - otherwise   : hevc_hv_4t_8multx4mult_lsx (takes height explicitly)
 */
static void hevc_hv_4t_16w_lsx(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    /* Exactly four rows: use the fixed-height helper and finish. */
    if (height == 4) {
        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 2);
        return;
    }

    /* Any other height goes through the helper that receives the row count. */
    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 2);
}
3178cabdff1aSopenharmony_ci
/*
 * 24-wide variant of the "hv 4t" filter.
 *
 * Passes all arguments straight through to hevc_hv_4t_8multx4mult_lsx and
 * appends the constant 3 as the final argument (presumably the number of
 * 8-pixel column groups, i.e. 24 / 8 -- TODO confirm against the helper).
 */
static void hevc_hv_4t_24w_lsx(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    const int32_t trailing_arg = 3;

    hevc_hv_4t_8multx4mult_lsx(src, src_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, trailing_arg);
}
3190cabdff1aSopenharmony_ci
/*
 * 32-wide variant of the "hv 4t" filter.
 *
 * Passes all arguments straight through to hevc_hv_4t_8multx4mult_lsx and
 * appends the constant 4 as the final argument (presumably the number of
 * 8-pixel column groups, i.e. 32 / 8 -- TODO confirm against the helper).
 */
static void hevc_hv_4t_32w_lsx(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    const int32_t trailing_arg = 4;

    hevc_hv_4t_8multx4mult_lsx(src, src_stride,
                               dst, dst_stride,
                               filter_x, filter_y,
                               height, trailing_arg);
}
3202cabdff1aSopenharmony_ci
3203cabdff1aSopenharmony_ci#define MC_COPY(WIDTH)                                                    \
3204cabdff1aSopenharmony_civoid ff_hevc_put_hevc_pel_pixels##WIDTH##_8_lsx(int16_t *dst,             \
3205cabdff1aSopenharmony_ci                                                uint8_t *src,             \
3206cabdff1aSopenharmony_ci                                                ptrdiff_t src_stride,     \
3207cabdff1aSopenharmony_ci                                                int height,               \
3208cabdff1aSopenharmony_ci                                                intptr_t mx,              \
3209cabdff1aSopenharmony_ci                                                intptr_t my,              \
3210cabdff1aSopenharmony_ci                                                int width)                \
3211cabdff1aSopenharmony_ci{                                                                         \
3212cabdff1aSopenharmony_ci    hevc_copy_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, height);  \
3213cabdff1aSopenharmony_ci}
3214cabdff1aSopenharmony_ci
3215cabdff1aSopenharmony_ciMC_COPY(4);
3216cabdff1aSopenharmony_ciMC_COPY(6);
3217cabdff1aSopenharmony_ciMC_COPY(8);
3218cabdff1aSopenharmony_ciMC_COPY(12);
3219cabdff1aSopenharmony_ciMC_COPY(16);
3220cabdff1aSopenharmony_ciMC_COPY(24);
3221cabdff1aSopenharmony_ciMC_COPY(32);
3222cabdff1aSopenharmony_ciMC_COPY(48);
3223cabdff1aSopenharmony_ciMC_COPY(64);
3224cabdff1aSopenharmony_ci
3225cabdff1aSopenharmony_ci#undef MC_COPY
3226cabdff1aSopenharmony_ci
3227cabdff1aSopenharmony_ci#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
3228cabdff1aSopenharmony_civoid ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst,          \
3229cabdff1aSopenharmony_ci                                                   uint8_t *src,          \
3230cabdff1aSopenharmony_ci                                                   ptrdiff_t src_stride,  \
3231cabdff1aSopenharmony_ci                                                   int height,            \
3232cabdff1aSopenharmony_ci                                                   intptr_t mx,           \
3233cabdff1aSopenharmony_ci                                                   intptr_t my,           \
3234cabdff1aSopenharmony_ci                                                   int width)             \
3235cabdff1aSopenharmony_ci{                                                                         \
3236cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
3237cabdff1aSopenharmony_ci                                                                          \
3238cabdff1aSopenharmony_ci    hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst,           \
3239cabdff1aSopenharmony_ci                                          MAX_PB_SIZE, filter, height);   \
3240cabdff1aSopenharmony_ci}
3241cabdff1aSopenharmony_ci
3242cabdff1aSopenharmony_ciMC(qpel, h, 4, 8, hz, mx);
3243cabdff1aSopenharmony_ciMC(qpel, h, 8, 8, hz, mx);
3244cabdff1aSopenharmony_ciMC(qpel, h, 12, 8, hz, mx);
3245cabdff1aSopenharmony_ciMC(qpel, h, 16, 8, hz, mx);
3246cabdff1aSopenharmony_ciMC(qpel, h, 24, 8, hz, mx);
3247cabdff1aSopenharmony_ciMC(qpel, h, 32, 8, hz, mx);
3248cabdff1aSopenharmony_ciMC(qpel, h, 48, 8, hz, mx);
3249cabdff1aSopenharmony_ciMC(qpel, h, 64, 8, hz, mx);
3250cabdff1aSopenharmony_ci
3251cabdff1aSopenharmony_ciMC(qpel, v, 4, 8, vt, my);
3252cabdff1aSopenharmony_ciMC(qpel, v, 8, 8, vt, my);
3253cabdff1aSopenharmony_ciMC(qpel, v, 12, 8, vt, my);
3254cabdff1aSopenharmony_ciMC(qpel, v, 16, 8, vt, my);
3255cabdff1aSopenharmony_ciMC(qpel, v, 24, 8, vt, my);
3256cabdff1aSopenharmony_ciMC(qpel, v, 32, 8, vt, my);
3257cabdff1aSopenharmony_ciMC(qpel, v, 48, 8, vt, my);
3258cabdff1aSopenharmony_ciMC(qpel, v, 64, 8, vt, my);
3259cabdff1aSopenharmony_ci
3260cabdff1aSopenharmony_ciMC(epel, h, 32, 4, hz, mx);
3261cabdff1aSopenharmony_ci
3262cabdff1aSopenharmony_ciMC(epel, v, 16, 4, vt, my);
3263cabdff1aSopenharmony_ciMC(epel, v, 24, 4, vt, my);
3264cabdff1aSopenharmony_ciMC(epel, v, 32, 4, vt, my);
3265cabdff1aSopenharmony_ci
3266cabdff1aSopenharmony_ci#undef MC
3267cabdff1aSopenharmony_ci
3268cabdff1aSopenharmony_ci#define MC_HV(PEL, WIDTH, TAP)                                          \
3269cabdff1aSopenharmony_civoid ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_lsx(int16_t *dst,           \
3270cabdff1aSopenharmony_ci                                                uint8_t *src,           \
3271cabdff1aSopenharmony_ci                                                ptrdiff_t src_stride,   \
3272cabdff1aSopenharmony_ci                                                int height,             \
3273cabdff1aSopenharmony_ci                                                intptr_t mx,            \
3274cabdff1aSopenharmony_ci                                                intptr_t my,            \
3275cabdff1aSopenharmony_ci                                                int width)              \
3276cabdff1aSopenharmony_ci{                                                                       \
3277cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
3278cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
3279cabdff1aSopenharmony_ci                                                                        \
3280cabdff1aSopenharmony_ci    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE,  \
3281cabdff1aSopenharmony_ci                                          filter_x, filter_y, height);  \
3282cabdff1aSopenharmony_ci}
3283cabdff1aSopenharmony_ci
3284cabdff1aSopenharmony_ciMC_HV(qpel, 4, 8);
3285cabdff1aSopenharmony_ciMC_HV(qpel, 8, 8);
3286cabdff1aSopenharmony_ciMC_HV(qpel, 12, 8);
3287cabdff1aSopenharmony_ciMC_HV(qpel, 16, 8);
3288cabdff1aSopenharmony_ciMC_HV(qpel, 24, 8);
3289cabdff1aSopenharmony_ciMC_HV(qpel, 32, 8);
3290cabdff1aSopenharmony_ciMC_HV(qpel, 48, 8);
3291cabdff1aSopenharmony_ciMC_HV(qpel, 64, 8);
3292cabdff1aSopenharmony_ci
3293cabdff1aSopenharmony_ciMC_HV(epel, 8, 4);
3294cabdff1aSopenharmony_ciMC_HV(epel, 12, 4);
3295cabdff1aSopenharmony_ciMC_HV(epel, 16, 4);
3296cabdff1aSopenharmony_ciMC_HV(epel, 24, 4);
3297cabdff1aSopenharmony_ciMC_HV(epel, 32, 4);
3298cabdff1aSopenharmony_ci
3299cabdff1aSopenharmony_ci#undef MC_HV
3300