/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
24cabdff1aSopenharmony_ci#include "hevcdsp_lsx.h"
25cabdff1aSopenharmony_ci
/*
 * Byte-index patterns passed as the selector operand of __lsx_vshuf_b.
 *
 * Each 16-byte row lists overlapping index pairs (i, i + 1), so a shuffle
 * with it lines up every source byte next to its right-hand neighbour.
 * The kernels in this part of the file load only the first row and derive
 * the selectors for the later tap pairs by adding 2, 4 and 6 to every index.
 *
 * Rows two and three contain indices >= 16; presumably those address a
 * second source vector for the 4-wide kernels -- confirm against the users
 * of byte offsets 16 and 32 of this table.
 *
 * The table is aligned to 64 bytes (0x40).
 */
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
34cabdff1aSopenharmony_ci
35cabdff1aSopenharmony_cistatic av_always_inline
36cabdff1aSopenharmony_civoid common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
37cabdff1aSopenharmony_ci                          uint8_t *dst, int32_t dst_stride,
38cabdff1aSopenharmony_ci                          const int8_t *filter, int32_t height)
39cabdff1aSopenharmony_ci{
40cabdff1aSopenharmony_ci    int32_t loop_cnt;
41cabdff1aSopenharmony_ci    __m128i mask0, mask1, mask2, mask3, out1, out2;
42cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
43cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
44cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
45cabdff1aSopenharmony_ci    __m128i res0, res1, res2, res3;
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
48cabdff1aSopenharmony_ci    src -= 3;
49cabdff1aSopenharmony_ci
50cabdff1aSopenharmony_ci    /* rearranging filter */
51cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
52cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
53cabdff1aSopenharmony_ci
54cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
55cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
56cabdff1aSopenharmony_ci
57cabdff1aSopenharmony_ci    for (loop_cnt = height; loop_cnt--;) {
58cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
59cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
60cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
61cabdff1aSopenharmony_ci                  src4, src5, src6, src7);
62cabdff1aSopenharmony_ci        src += src_stride;
63cabdff1aSopenharmony_ci
64cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
65cabdff1aSopenharmony_ci                  vec0, vec1);
66cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
67cabdff1aSopenharmony_ci                  vec2, vec3);
68cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
69cabdff1aSopenharmony_ci                  vec3, filt0, res0, res1, res2, res3);
70cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
71cabdff1aSopenharmony_ci                  vec0, vec1);
72cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
73cabdff1aSopenharmony_ci                  vec2, vec3);
74cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
75cabdff1aSopenharmony_ci                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
76cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
77cabdff1aSopenharmony_ci                  vec4, vec5);
78cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
79cabdff1aSopenharmony_ci                  vec6, vec7);
80cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
81cabdff1aSopenharmony_ci                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
82cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
83cabdff1aSopenharmony_ci                  vec4, vec5);
84cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
85cabdff1aSopenharmony_ci                  vec6, vec7);
86cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
87cabdff1aSopenharmony_ci                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
90cabdff1aSopenharmony_ci                  out1, out2);
91cabdff1aSopenharmony_ci        __lsx_vst(out1, dst, 0);
92cabdff1aSopenharmony_ci        __lsx_vst(out2, dst, 16);
93cabdff1aSopenharmony_ci
94cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
95cabdff1aSopenharmony_ci                  vec0, vec1);
96cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
97cabdff1aSopenharmony_ci                  vec2, vec3);
98cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
99cabdff1aSopenharmony_ci                  vec3, filt0, res0, res1, res2, res3);
100cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
101cabdff1aSopenharmony_ci                  vec0, vec1);
102cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
103cabdff1aSopenharmony_ci                  vec2, vec3);
104cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
105cabdff1aSopenharmony_ci                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
106cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
107cabdff1aSopenharmony_ci                  vec4, vec5);
108cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
109cabdff1aSopenharmony_ci                  vec6, vec7);
110cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
111cabdff1aSopenharmony_ci                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
112cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
113cabdff1aSopenharmony_ci                  vec4, vec5);
114cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
115cabdff1aSopenharmony_ci                  vec6, vec7);
116cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
117cabdff1aSopenharmony_ci                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
118cabdff1aSopenharmony_ci
119cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
120cabdff1aSopenharmony_ci                  out1, out2);
121cabdff1aSopenharmony_ci        __lsx_vst(out1, dst, 32);
122cabdff1aSopenharmony_ci        __lsx_vst(out2, dst, 48);
123cabdff1aSopenharmony_ci        dst += dst_stride;
124cabdff1aSopenharmony_ci    }
125cabdff1aSopenharmony_ci}
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_cistatic av_always_inline
128cabdff1aSopenharmony_civoid common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
129cabdff1aSopenharmony_ci                         uint8_t *dst, int32_t dst_stride,
130cabdff1aSopenharmony_ci                         const int8_t *filter, int32_t height)
131cabdff1aSopenharmony_ci{
132cabdff1aSopenharmony_ci    uint32_t loop_cnt;
133cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
134cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
135cabdff1aSopenharmony_ci    int32_t src_stride_4x = (src_stride << 2);
136cabdff1aSopenharmony_ci    int32_t dst_stride_4x = (dst_stride << 2);
137cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
138cabdff1aSopenharmony_ci    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
139cabdff1aSopenharmony_ci
140cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
141cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
142cabdff1aSopenharmony_ci    __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
143cabdff1aSopenharmony_ci    __m128i tmp0, tmp1;
144cabdff1aSopenharmony_ci    __m128i out0_r, out1_r, out2_r, out3_r;
145cabdff1aSopenharmony_ci
146cabdff1aSopenharmony_ci    src -= src_stride_3x;
147cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
148cabdff1aSopenharmony_ci              filt0, filt1, filt2, filt3);
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
151cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
152cabdff1aSopenharmony_ci    src3 = __lsx_vldx(src, src_stride_3x);
153cabdff1aSopenharmony_ci    src += src_stride_4x;
154cabdff1aSopenharmony_ci    src4 = __lsx_vld(src, 0);
155cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
156cabdff1aSopenharmony_ci    src += src_stride_3x;
157cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
158cabdff1aSopenharmony_ci              src10_r, src32_r, src54_r, src21_r);
159cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
160cabdff1aSopenharmony_ci
161cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 2); loop_cnt--;) {
162cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
163cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
164cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
165cabdff1aSopenharmony_ci        src += src_stride_4x;
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
168cabdff1aSopenharmony_ci                  src9, src76_r, src87_r, src98_r, src109_r);
169cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
170cabdff1aSopenharmony_ci                  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
171cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
172cabdff1aSopenharmony_ci                  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
173cabdff1aSopenharmony_ci                  filt1, out0_r, out1_r, out2_r, out3_r);
174cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
175cabdff1aSopenharmony_ci                  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
176cabdff1aSopenharmony_ci                  filt2, out0_r, out1_r, out2_r, out3_r);
177cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
178cabdff1aSopenharmony_ci                  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
179cabdff1aSopenharmony_ci                  filt3, out0_r, out1_r, out2_r, out3_r);
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
182cabdff1aSopenharmony_ci                  tmp0, tmp1)
183cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp0, dst, 0, 0);
184cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
185cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
186cabdff1aSopenharmony_ci        __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
187cabdff1aSopenharmony_ci        dst += dst_stride_4x;
188cabdff1aSopenharmony_ci
189cabdff1aSopenharmony_ci        src10_r = src54_r;
190cabdff1aSopenharmony_ci        src32_r = src76_r;
191cabdff1aSopenharmony_ci        src54_r = src98_r;
192cabdff1aSopenharmony_ci        src21_r = src65_r;
193cabdff1aSopenharmony_ci        src43_r = src87_r;
194cabdff1aSopenharmony_ci        src65_r = src109_r;
195cabdff1aSopenharmony_ci        src6 = src10;
196cabdff1aSopenharmony_ci    }
197cabdff1aSopenharmony_ci}
198cabdff1aSopenharmony_ci
199cabdff1aSopenharmony_cistatic av_always_inline
200cabdff1aSopenharmony_civoid common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
201cabdff1aSopenharmony_ci                          int32_t dst_stride, const int8_t *filter,
202cabdff1aSopenharmony_ci                          int32_t height, int32_t width)
203cabdff1aSopenharmony_ci{
204cabdff1aSopenharmony_ci    uint8_t *src_tmp;
205cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
206cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
207cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
208cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
209cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
210cabdff1aSopenharmony_ci    const int32_t dst_stride_4x = (dst_stride << 2);
211cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
212cabdff1aSopenharmony_ci    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
215cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
216cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
217cabdff1aSopenharmony_ci    __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
218cabdff1aSopenharmony_ci    __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
219cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3;
220cabdff1aSopenharmony_ci    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
221cabdff1aSopenharmony_ci
222cabdff1aSopenharmony_ci    src -= src_stride_3x;
223cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
224cabdff1aSopenharmony_ci              filt1, filt2, filt3);
225cabdff1aSopenharmony_ci
226cabdff1aSopenharmony_ci    for (cnt = (width >> 4); cnt--;) {
227cabdff1aSopenharmony_ci        src_tmp = src;
228cabdff1aSopenharmony_ci        dst_tmp = dst;
229cabdff1aSopenharmony_ci
230cabdff1aSopenharmony_ci        src0 = __lsx_vld(src_tmp, 0);
231cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
232cabdff1aSopenharmony_ci                  src1, src2);
233cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src_tmp, src_stride_3x);
234cabdff1aSopenharmony_ci        src_tmp += src_stride_4x;
235cabdff1aSopenharmony_ci        src4 = __lsx_vld(src_tmp, 0);
236cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
237cabdff1aSopenharmony_ci                  src5, src6);
238cabdff1aSopenharmony_ci        src_tmp += src_stride_3x;
239cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
240cabdff1aSopenharmony_ci                  src10_r, src32_r, src54_r, src21_r);
241cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
242cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
243cabdff1aSopenharmony_ci                  src10_l, src32_l, src54_l, src21_l);
244cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
245cabdff1aSopenharmony_ci
246cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
247cabdff1aSopenharmony_ci            src7 = __lsx_vld(src_tmp, 0);
248cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
249cabdff1aSopenharmony_ci                      src8, src9);
250cabdff1aSopenharmony_ci            src10 = __lsx_vldx(src_tmp, src_stride_3x);
251cabdff1aSopenharmony_ci            src_tmp += src_stride_4x;
252cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
253cabdff1aSopenharmony_ci                      src9, src76_r, src87_r, src98_r, src109_r);
254cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
255cabdff1aSopenharmony_ci                      src9, src76_l, src87_l, src98_l, src109_l);
256cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
257cabdff1aSopenharmony_ci                      filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
258cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
259cabdff1aSopenharmony_ci                      src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
260cabdff1aSopenharmony_ci                      filt1, out0_r, out1_r, out2_r, out3_r);
261cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
262cabdff1aSopenharmony_ci                      src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
263cabdff1aSopenharmony_ci                      filt2, out0_r, out1_r, out2_r, out3_r);
264cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
265cabdff1aSopenharmony_ci                      src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
266cabdff1aSopenharmony_ci                      filt3, out0_r, out1_r, out2_r, out3_r);
267cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
268cabdff1aSopenharmony_ci                      filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
269cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
270cabdff1aSopenharmony_ci                      src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
271cabdff1aSopenharmony_ci                      filt1, out0_l, out1_l, out2_l, out3_l);
272cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
273cabdff1aSopenharmony_ci                      src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
274cabdff1aSopenharmony_ci                      filt2, out0_l, out1_l, out2_l, out3_l);
275cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
276cabdff1aSopenharmony_ci                      src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
277cabdff1aSopenharmony_ci                      filt3, out0_l, out1_l, out2_l, out3_l);
278cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
279cabdff1aSopenharmony_ci                      6, out2_l, out2_r, 6, out3_l, out3_r, 6,
280cabdff1aSopenharmony_ci                      tmp0, tmp1, tmp2, tmp3);
281cabdff1aSopenharmony_ci            __lsx_vst(tmp0, dst_tmp, 0);
282cabdff1aSopenharmony_ci            __lsx_vstx(tmp1, dst_tmp, dst_stride);
283cabdff1aSopenharmony_ci            __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
284cabdff1aSopenharmony_ci            __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
285cabdff1aSopenharmony_ci            dst_tmp += dst_stride_4x;
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci            src10_r = src54_r;
288cabdff1aSopenharmony_ci            src32_r = src76_r;
289cabdff1aSopenharmony_ci            src54_r = src98_r;
290cabdff1aSopenharmony_ci            src21_r = src65_r;
291cabdff1aSopenharmony_ci            src43_r = src87_r;
292cabdff1aSopenharmony_ci            src65_r = src109_r;
293cabdff1aSopenharmony_ci            src10_l = src54_l;
294cabdff1aSopenharmony_ci            src32_l = src76_l;
295cabdff1aSopenharmony_ci            src54_l = src98_l;
296cabdff1aSopenharmony_ci            src21_l = src65_l;
297cabdff1aSopenharmony_ci            src43_l = src87_l;
298cabdff1aSopenharmony_ci            src65_l = src109_l;
299cabdff1aSopenharmony_ci            src6 = src10;
300cabdff1aSopenharmony_ci        }
301cabdff1aSopenharmony_ci
302cabdff1aSopenharmony_ci        src += 16;
303cabdff1aSopenharmony_ci        dst += 16;
304cabdff1aSopenharmony_ci    }
305cabdff1aSopenharmony_ci}
306cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 24-byte-wide block: the left 16 columns go
 * through the 16-wide kernel, the remaining 8 columns (byte offset 16 in
 * both src and dst) through the 8-wide kernel.
 */
static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
    common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}
315cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 32-byte-wide block; forwards to the 16-wide
 * kernel with width fixed to 32 (two column strips).
 */
static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
}
322cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 48-byte-wide block; forwards to the 16-wide
 * kernel with width fixed to 48 (three column strips).
 */
static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
}
329cabdff1aSopenharmony_ci
/*
 * Vertical 8-tap filter for a 64-byte-wide block; forwards to the 16-wide
 * kernel with width fixed to 64 (four column strips).
 */
static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
}
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_cistatic av_always_inline
338cabdff1aSopenharmony_civoid hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
339cabdff1aSopenharmony_ci                        int32_t dst_stride, const int8_t *filter_x,
340cabdff1aSopenharmony_ci                        const int8_t *filter_y, int32_t height, int32_t width)
341cabdff1aSopenharmony_ci{
342cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
343cabdff1aSopenharmony_ci    uint8_t *src_tmp;
344cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
345cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
346cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
347cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
348cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
349cabdff1aSopenharmony_ci
350cabdff1aSopenharmony_ci    __m128i out;
351cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
352cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt2, filt3;
353cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
354cabdff1aSopenharmony_ci    __m128i mask1, mask2, mask3;
355cabdff1aSopenharmony_ci    __m128i filter_vec;
356cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
357cabdff1aSopenharmony_ci    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
358cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
359cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
360cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
361cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
362cabdff1aSopenharmony_ci    __m128i dst21_r, dst43_r, dst65_r, dst87_r;
363cabdff1aSopenharmony_ci    __m128i dst21_l, dst43_l, dst65_l, dst87_l;
364cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
365cabdff1aSopenharmony_ci
366cabdff1aSopenharmony_ci    src -= (src_stride_3x + 3);
367cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
368cabdff1aSopenharmony_ci              filter_x, 6, filt0, filt1, filt2, filt3);
369cabdff1aSopenharmony_ci
370cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
371cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
372cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
373cabdff1aSopenharmony_ci              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
376cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask0, 6);
377cabdff1aSopenharmony_ci
378cabdff1aSopenharmony_ci    for (cnt = width >> 3; cnt--;) {
379cabdff1aSopenharmony_ci        src_tmp = src;
380cabdff1aSopenharmony_ci        dst_tmp = dst;
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci        src0 = __lsx_vld(src_tmp, 0);
383cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
384cabdff1aSopenharmony_ci                  src1, src2);
385cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src_tmp, src_stride_3x);
386cabdff1aSopenharmony_ci        src_tmp += src_stride_4x;
387cabdff1aSopenharmony_ci        src4 = __lsx_vld(src_tmp, 0);
388cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
389cabdff1aSopenharmony_ci                  src5, src6);
390cabdff1aSopenharmony_ci        src_tmp += src_stride_3x;
391cabdff1aSopenharmony_ci
392cabdff1aSopenharmony_ci        /* row 0 row 1 row 2 row 3 */
393cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
394cabdff1aSopenharmony_ci                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
395cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
396cabdff1aSopenharmony_ci                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
397cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
398cabdff1aSopenharmony_ci                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
399cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
400cabdff1aSopenharmony_ci                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
401cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
402cabdff1aSopenharmony_ci                  vec12, filt0, dst0, dst1, dst2, dst3);
403cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
404cabdff1aSopenharmony_ci                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
405cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
406cabdff1aSopenharmony_ci                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
407cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
408cabdff1aSopenharmony_ci                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
409cabdff1aSopenharmony_ci
410cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
411cabdff1aSopenharmony_ci                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
412cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
413cabdff1aSopenharmony_ci                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
414cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
415cabdff1aSopenharmony_ci                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
416cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
417cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
418cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
419cabdff1aSopenharmony_ci                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
420cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
421cabdff1aSopenharmony_ci                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
422cabdff1aSopenharmony_ci        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
423cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
424cabdff1aSopenharmony_ci                  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
425cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
426cabdff1aSopenharmony_ci                  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
427cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
428cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
429cabdff1aSopenharmony_ci
430cabdff1aSopenharmony_ci        for (loop_cnt = height >> 1; loop_cnt--;) {
431cabdff1aSopenharmony_ci            src7 = __lsx_vld(src_tmp, 0);
432cabdff1aSopenharmony_ci            src8 = __lsx_vldx(src_tmp, src_stride);
433cabdff1aSopenharmony_ci            src_tmp += src_stride_2x;
434cabdff1aSopenharmony_ci
435cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
436cabdff1aSopenharmony_ci                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
437cabdff1aSopenharmony_ci            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
438cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
439cabdff1aSopenharmony_ci                      filt2, dst7, dst7);
440cabdff1aSopenharmony_ci            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
441cabdff1aSopenharmony_ci            dst76_r = __lsx_vilvl_h(dst7, dst6);
442cabdff1aSopenharmony_ci            dst76_l = __lsx_vilvh_h(dst7, dst6);
443cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
444cabdff1aSopenharmony_ci                      dst0_r, dst0_l);
445cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
446cabdff1aSopenharmony_ci                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
447cabdff1aSopenharmony_ci                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
448cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
449cabdff1aSopenharmony_ci                      dst76_l, filt_h3, dst0_r, dst0_l);
450cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
451cabdff1aSopenharmony_ci
452cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
453cabdff1aSopenharmony_ci                      src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
454cabdff1aSopenharmony_ci            dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
455cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
456cabdff1aSopenharmony_ci                      filt2, dst8, dst8);
457cabdff1aSopenharmony_ci            dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
458cabdff1aSopenharmony_ci
459cabdff1aSopenharmony_ci            dst87_r = __lsx_vilvl_h(dst8, dst7);
460cabdff1aSopenharmony_ci            dst87_l = __lsx_vilvh_h(dst8, dst7);
461cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
462cabdff1aSopenharmony_ci                      dst1_r, dst1_l);
463cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
464cabdff1aSopenharmony_ci                      dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
465cabdff1aSopenharmony_ci                      dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
466cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
467cabdff1aSopenharmony_ci                      dst87_l, filt_h3, dst1_r, dst1_l);
468cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
469cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6,dst1_r, 6, dst1_l,
470cabdff1aSopenharmony_ci                      6, dst0_r, dst0_l, dst1_r, dst1_l);
471cabdff1aSopenharmony_ci            DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
472cabdff1aSopenharmony_ci                      dst0_l, dst0_r, dst1_l, dst1_r);
473cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
474cabdff1aSopenharmony_ci                      dst0, dst1);
475cabdff1aSopenharmony_ci            out = __lsx_vpickev_b(dst1, dst0);
476cabdff1aSopenharmony_ci            __lsx_vstelm_d(out, dst_tmp, 0, 0);
477cabdff1aSopenharmony_ci            __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
478cabdff1aSopenharmony_ci            dst_tmp += dst_stride_2x;
479cabdff1aSopenharmony_ci
480cabdff1aSopenharmony_ci            dst10_r = dst32_r;
481cabdff1aSopenharmony_ci            dst32_r = dst54_r;
482cabdff1aSopenharmony_ci            dst54_r = dst76_r;
483cabdff1aSopenharmony_ci            dst10_l = dst32_l;
484cabdff1aSopenharmony_ci            dst32_l = dst54_l;
485cabdff1aSopenharmony_ci            dst54_l = dst76_l;
486cabdff1aSopenharmony_ci            dst21_r = dst43_r;
487cabdff1aSopenharmony_ci            dst43_r = dst65_r;
488cabdff1aSopenharmony_ci            dst65_r = dst87_r;
489cabdff1aSopenharmony_ci            dst21_l = dst43_l;
490cabdff1aSopenharmony_ci            dst43_l = dst65_l;
491cabdff1aSopenharmony_ci            dst65_l = dst87_l;
492cabdff1aSopenharmony_ci            dst6 = dst8;
493cabdff1aSopenharmony_ci        }
494cabdff1aSopenharmony_ci        src += 8;
495cabdff1aSopenharmony_ci        dst += 8;
496cabdff1aSopenharmony_ci    }
497cabdff1aSopenharmony_ci}
498cabdff1aSopenharmony_ci
/*
 * 8-wide entry point of the 8-tap horizontal + vertical filter: forwards all
 * arguments to hevc_hv_8t_8x2_lsx() and passes 8 as its final argument
 * (presumably the block width in pixels, matching the function name).
 */
static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    const int32_t block_width = 8;

    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
                       height, block_width);
}
506cabdff1aSopenharmony_ci
/*
 * 16-wide entry point of the 8-tap horizontal + vertical filter: forwards all
 * arguments to hevc_hv_8t_8x2_lsx() and passes 16 as its final argument
 * (presumably the block width in pixels, matching the function name).
 */
static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t block_width = 16;

    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
                       height, block_width);
}
514cabdff1aSopenharmony_ci
/*
 * 24-wide entry point of the 8-tap horizontal + vertical filter: forwards all
 * arguments to hevc_hv_8t_8x2_lsx() and passes 24 as its final argument
 * (presumably the block width in pixels, matching the function name).
 */
static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t block_width = 24;

    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
                       height, block_width);
}
522cabdff1aSopenharmony_ci
/*
 * 32-wide entry point of the 8-tap horizontal + vertical filter: forwards all
 * arguments to hevc_hv_8t_8x2_lsx() and passes 32 as its final argument
 * (presumably the block width in pixels, matching the function name).
 */
static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t block_width = 32;

    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
                       height, block_width);
}
530cabdff1aSopenharmony_ci
/*
 * 48-wide entry point of the 8-tap horizontal + vertical filter: forwards all
 * arguments to hevc_hv_8t_8x2_lsx() and passes 48 as its final argument
 * (presumably the block width in pixels, matching the function name).
 */
static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t block_width = 48;

    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
                       height, block_width);
}
538cabdff1aSopenharmony_ci
/*
 * 64-wide entry point of the 8-tap horizontal + vertical filter: forwards all
 * arguments to hevc_hv_8t_8x2_lsx() and passes 64 as its final argument
 * (presumably the block width in pixels, matching the function name).
 */
static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t block_width = 64;

    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
                       height, block_width);
}
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_cistatic av_always_inline
548cabdff1aSopenharmony_civoid common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride,
549cabdff1aSopenharmony_ci                          uint8_t *dst, int32_t dst_stride,
550cabdff1aSopenharmony_ci                          const int8_t *filter, int32_t height)
551cabdff1aSopenharmony_ci{
552cabdff1aSopenharmony_ci    uint32_t loop_cnt;
553cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
554cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
555cabdff1aSopenharmony_ci    uint8_t *_src;
556cabdff1aSopenharmony_ci
557cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
558cabdff1aSopenharmony_ci    __m128i src11, filt0, filt1;
559cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
560cabdff1aSopenharmony_ci    __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
561cabdff1aSopenharmony_ci    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
562cabdff1aSopenharmony_ci    __m128i out1, out2, out3, out4;
563cabdff1aSopenharmony_ci
564cabdff1aSopenharmony_ci    src -= src_stride;
565cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
566cabdff1aSopenharmony_ci    _src = src + 16;
567cabdff1aSopenharmony_ci
568cabdff1aSopenharmony_ci    /* 16 width */
569cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
570cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
571cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
572cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
573cabdff1aSopenharmony_ci
574cabdff1aSopenharmony_ci    /* 8 width */
575cabdff1aSopenharmony_ci    src6 = __lsx_vld(_src, 0);
576cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
577cabdff1aSopenharmony_ci    src += src_stride_3x;
578cabdff1aSopenharmony_ci    _src += src_stride_3x;
579cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
580cabdff1aSopenharmony_ci
581cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
582cabdff1aSopenharmony_ci        /* 16 width */
583cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
584cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
585cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
586cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
587cabdff1aSopenharmony_ci
588cabdff1aSopenharmony_ci        /* 8 width */
589cabdff1aSopenharmony_ci        src += src_stride_2x;
590cabdff1aSopenharmony_ci        _src += src_stride_2x;
591cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
592cabdff1aSopenharmony_ci
593cabdff1aSopenharmony_ci        /* 16 width */
594cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
595cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
596cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
597cabdff1aSopenharmony_ci                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
598cabdff1aSopenharmony_ci                  out0_r, out0_l, out1_r, out1_l);
599cabdff1aSopenharmony_ci
600cabdff1aSopenharmony_ci        /* 8 width */
601cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
602cabdff1aSopenharmony_ci                  out2_r, out3_r);
603cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
604cabdff1aSopenharmony_ci                  src109_r, filt1, out2_r, out3_r);
605cabdff1aSopenharmony_ci
606cabdff1aSopenharmony_ci        /* 16 + 8 width */
607cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
608cabdff1aSopenharmony_ci                out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
609cabdff1aSopenharmony_ci        __lsx_vst(out1, dst, 0);
610cabdff1aSopenharmony_ci        __lsx_vstelm_d(out2, dst, 16, 0);
611cabdff1aSopenharmony_ci        dst += dst_stride;
612cabdff1aSopenharmony_ci        __lsx_vst(out4, dst, 0);
613cabdff1aSopenharmony_ci        __lsx_vstelm_d(out3, dst, 16, 0);
614cabdff1aSopenharmony_ci        dst += dst_stride;
615cabdff1aSopenharmony_ci
616cabdff1aSopenharmony_ci        /* 16 width */
617cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
618cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
619cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
620cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
621cabdff1aSopenharmony_ci
622cabdff1aSopenharmony_ci        /* 8 width */
623cabdff1aSopenharmony_ci        src += src_stride_2x;
624cabdff1aSopenharmony_ci        _src += src_stride_2x;
625cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
626cabdff1aSopenharmony_ci
627cabdff1aSopenharmony_ci        /* 16 width */
628cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
629cabdff1aSopenharmony_ci                  filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
630cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
631cabdff1aSopenharmony_ci                  filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
632cabdff1aSopenharmony_ci                  out0_r, out0_l, out1_r, out1_l);
633cabdff1aSopenharmony_ci
634cabdff1aSopenharmony_ci        /* 8 width */
635cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
636cabdff1aSopenharmony_ci                  out2_r, out3_r);
637cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
638cabdff1aSopenharmony_ci                  src87_r, filt1, out2_r, out3_r);
639cabdff1aSopenharmony_ci
640cabdff1aSopenharmony_ci        /* 16 + 8 width */
641cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
642cabdff1aSopenharmony_ci                  out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ci        __lsx_vst(out1, dst, 0);
645cabdff1aSopenharmony_ci        __lsx_vstelm_d(out2, dst, 16, 0);
646cabdff1aSopenharmony_ci        dst += dst_stride;
647cabdff1aSopenharmony_ci        __lsx_vst(out3, dst, 0);
648cabdff1aSopenharmony_ci        __lsx_vstelm_d(out4, dst, 16, 0);
649cabdff1aSopenharmony_ci        dst += dst_stride;
650cabdff1aSopenharmony_ci    }
651cabdff1aSopenharmony_ci}
652cabdff1aSopenharmony_ci
653cabdff1aSopenharmony_cistatic av_always_inline
654cabdff1aSopenharmony_civoid common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride,
655cabdff1aSopenharmony_ci                          uint8_t *dst, int32_t dst_stride,
656cabdff1aSopenharmony_ci                          const int8_t *filter, int32_t height)
657cabdff1aSopenharmony_ci{
658cabdff1aSopenharmony_ci    uint32_t loop_cnt;
659cabdff1aSopenharmony_ci    int32_t src_stride_2x = (src_stride << 1);
660cabdff1aSopenharmony_ci    int32_t dst_stride_2x = (dst_stride << 1);
661cabdff1aSopenharmony_ci    int32_t src_stride_3x = src_stride_2x + src_stride;
662cabdff1aSopenharmony_ci    uint8_t *_src;
663cabdff1aSopenharmony_ci
664cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
665cabdff1aSopenharmony_ci    __m128i src10_r, src32_r, src76_r, src98_r;
666cabdff1aSopenharmony_ci    __m128i src21_r, src43_r, src87_r, src109_r;
667cabdff1aSopenharmony_ci    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
668cabdff1aSopenharmony_ci    __m128i src10_l, src32_l, src76_l, src98_l;
669cabdff1aSopenharmony_ci    __m128i src21_l, src43_l, src87_l, src109_l;
670cabdff1aSopenharmony_ci    __m128i filt0, filt1;
671cabdff1aSopenharmony_ci    __m128i out1, out2;
672cabdff1aSopenharmony_ci
673cabdff1aSopenharmony_ci    src -= src_stride;
674cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
675cabdff1aSopenharmony_ci    _src = src + 16;
676cabdff1aSopenharmony_ci
677cabdff1aSopenharmony_ci    /* 16 width */
678cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
679cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
680cabdff1aSopenharmony_ci
681cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
682cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
683cabdff1aSopenharmony_ci
684cabdff1aSopenharmony_ci    /* next 16 width */
685cabdff1aSopenharmony_ci    src6 = __lsx_vld(_src, 0);
686cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
687cabdff1aSopenharmony_ci    src += src_stride_3x;
688cabdff1aSopenharmony_ci    _src += src_stride_3x;
689cabdff1aSopenharmony_ci
690cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
691cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
692cabdff1aSopenharmony_ci
693cabdff1aSopenharmony_ci    for (loop_cnt = (height >> 1); loop_cnt--;) {
694cabdff1aSopenharmony_ci        /* 16 width */
695cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
696cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
697cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
698cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
699cabdff1aSopenharmony_ci
700cabdff1aSopenharmony_ci        /* 16 width */
701cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
702cabdff1aSopenharmony_ci                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
703cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
704cabdff1aSopenharmony_ci                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
705cabdff1aSopenharmony_ci                  out0_r, out0_l, out1_r, out1_l);
706cabdff1aSopenharmony_ci
707cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
708cabdff1aSopenharmony_ci                  out1, out2);
709cabdff1aSopenharmony_ci        __lsx_vst(out1, dst, 0);
710cabdff1aSopenharmony_ci        __lsx_vstx(out2, dst, dst_stride);
711cabdff1aSopenharmony_ci
712cabdff1aSopenharmony_ci        src10_r = src32_r;
713cabdff1aSopenharmony_ci        src21_r = src43_r;
714cabdff1aSopenharmony_ci        src10_l = src32_l;
715cabdff1aSopenharmony_ci        src21_l = src43_l;
716cabdff1aSopenharmony_ci        src2 = src4;
717cabdff1aSopenharmony_ci
718cabdff1aSopenharmony_ci        /* next 16 width */
719cabdff1aSopenharmony_ci        src += src_stride_2x;
720cabdff1aSopenharmony_ci        _src += src_stride_2x;
721cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
722cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
723cabdff1aSopenharmony_ci
724cabdff1aSopenharmony_ci        /* next 16 width */
725cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
726cabdff1aSopenharmony_ci                  filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
727cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
728cabdff1aSopenharmony_ci                  filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
729cabdff1aSopenharmony_ci                  out2_r, out2_l, out3_r, out3_l);
730cabdff1aSopenharmony_ci
731cabdff1aSopenharmony_ci        /* next 16 width */
732cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
733cabdff1aSopenharmony_ci                  out1, out2);
734cabdff1aSopenharmony_ci        __lsx_vst(out1, dst, 16);
735cabdff1aSopenharmony_ci        __lsx_vst(out2, dst + dst_stride, 16);
736cabdff1aSopenharmony_ci
737cabdff1aSopenharmony_ci        dst += dst_stride_2x;
738cabdff1aSopenharmony_ci
739cabdff1aSopenharmony_ci        src76_r = src98_r;
740cabdff1aSopenharmony_ci        src87_r = src109_r;
741cabdff1aSopenharmony_ci        src76_l = src98_l;
742cabdff1aSopenharmony_ci        src87_l = src109_l;
743cabdff1aSopenharmony_ci        src8 = src10;
744cabdff1aSopenharmony_ci    }
745cabdff1aSopenharmony_ci}
746cabdff1aSopenharmony_ci
747cabdff1aSopenharmony_cistatic av_always_inline
748cabdff1aSopenharmony_civoid hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
749cabdff1aSopenharmony_ci                        int32_t dst_stride, const int8_t *filter_x,
750cabdff1aSopenharmony_ci                        const int8_t *filter_y)
751cabdff1aSopenharmony_ci{
752cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
753cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
754cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
755cabdff1aSopenharmony_ci    __m128i out;
756cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4;
757cabdff1aSopenharmony_ci    __m128i filt0, filt1;
758cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filter_vec;
759cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
760cabdff1aSopenharmony_ci    __m128i mask1;
761cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
762cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4;
763cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
764cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
765cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
766cabdff1aSopenharmony_ci    __m128i out0_r, out1_r;
767cabdff1aSopenharmony_ci
768cabdff1aSopenharmony_ci    src -= (src_stride + 1);
769cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
770cabdff1aSopenharmony_ci
771cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
772cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
773cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
774cabdff1aSopenharmony_ci
775cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
776cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
777cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
778cabdff1aSopenharmony_ci              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
779cabdff1aSopenharmony_ci
780cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
781cabdff1aSopenharmony_ci              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
782cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
783cabdff1aSopenharmony_ci              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
784cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
787cabdff1aSopenharmony_ci              filt0, dst0, dst1, dst2, dst3);
788cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
789cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
790cabdff1aSopenharmony_ci              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
791cabdff1aSopenharmony_ci    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
792cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
793cabdff1aSopenharmony_ci              dst10_r, dst21_r, dst32_r, dst43_r);
794cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
795cabdff1aSopenharmony_ci              dst10_l, dst21_l, dst32_l, dst43_l);
796cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
797cabdff1aSopenharmony_ci              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
798cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
799cabdff1aSopenharmony_ci              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
800cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
801cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
802cabdff1aSopenharmony_ci              out0_r, out1_r);
803cabdff1aSopenharmony_ci    out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
804cabdff1aSopenharmony_ci    __lsx_vstelm_d(out, dst, 0, 0);
805cabdff1aSopenharmony_ci    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
806cabdff1aSopenharmony_ci}
807cabdff1aSopenharmony_ci
808cabdff1aSopenharmony_cistatic av_always_inline
809cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
810cabdff1aSopenharmony_ci                            int32_t dst_stride, const int8_t *filter_x,
811cabdff1aSopenharmony_ci                            const int8_t *filter_y, int32_t width8mult)
812cabdff1aSopenharmony_ci{
813cabdff1aSopenharmony_ci    uint32_t cnt;
814cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
815cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
816cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
817cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
818cabdff1aSopenharmony_ci    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
819cabdff1aSopenharmony_ci
820cabdff1aSopenharmony_ci    __m128i out0, out1;
821cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
822cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
823cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
824cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
825cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
826cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
827cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
828cabdff1aSopenharmony_ci
829cabdff1aSopenharmony_ci    src -= (src_stride + 1);
830cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
831cabdff1aSopenharmony_ci
832cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
833cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
834cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
835cabdff1aSopenharmony_ci
836cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
837cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
838cabdff1aSopenharmony_ci
839cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
840cabdff1aSopenharmony_ci        src0 = __lsx_vld(src, 0);
841cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
842cabdff1aSopenharmony_ci        src3 = __lsx_vldx(src, src_stride_3x);
843cabdff1aSopenharmony_ci        src += src_stride_4x;
844cabdff1aSopenharmony_ci        src4 = __lsx_vld(src, 0);
845cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
846cabdff1aSopenharmony_ci        src += (8 - src_stride_4x);
847cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
848cabdff1aSopenharmony_ci                  vec0, vec1);
849cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
850cabdff1aSopenharmony_ci                  vec2, vec3);
851cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
852cabdff1aSopenharmony_ci                  vec4, vec5);
853cabdff1aSopenharmony_ci
854cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
855cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
856cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
857cabdff1aSopenharmony_ci                  dst0, dst1);
858cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
859cabdff1aSopenharmony_ci
860cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
861cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
862cabdff1aSopenharmony_ci
863cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
864cabdff1aSopenharmony_ci                  vec0, vec1);
865cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
866cabdff1aSopenharmony_ci                  vec2, vec3);
867cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
868cabdff1aSopenharmony_ci                  vec4, vec5);
869cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
870cabdff1aSopenharmony_ci                  vec6, vec7);
871cabdff1aSopenharmony_ci
872cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
873cabdff1aSopenharmony_ci                  vec6, filt0, dst3, dst4, dst5, dst6);
874cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
875cabdff1aSopenharmony_ci                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
876cabdff1aSopenharmony_ci
877cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
878cabdff1aSopenharmony_ci                  dst5, dst32_r, dst43_r, dst54_r, dst65_r);
879cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
880cabdff1aSopenharmony_ci                  dst5, dst32_l, dst43_l, dst54_l, dst65_l);
881cabdff1aSopenharmony_ci
882cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
883cabdff1aSopenharmony_ci                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
884cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
885cabdff1aSopenharmony_ci                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
886cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
887cabdff1aSopenharmony_ci                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
888cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
889cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
890cabdff1aSopenharmony_ci                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
891cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
892cabdff1aSopenharmony_ci
893cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
894cabdff1aSopenharmony_ci                  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
895cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
896cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst, 0, 0);
897cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
898cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
899cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
900cabdff1aSopenharmony_ci        dst += 8;
901cabdff1aSopenharmony_ci    }
902cabdff1aSopenharmony_ci}
903cabdff1aSopenharmony_ci
904cabdff1aSopenharmony_cistatic av_always_inline
905cabdff1aSopenharmony_civoid hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
906cabdff1aSopenharmony_ci                        int32_t dst_stride, const int8_t *filter_x,
907cabdff1aSopenharmony_ci                        const int8_t *filter_y)
908cabdff1aSopenharmony_ci{
909cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
910cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
911cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
912cabdff1aSopenharmony_ci    const int32_t dst_stride_4x = (dst_stride << 2);
913cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
914cabdff1aSopenharmony_ci    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
915cabdff1aSopenharmony_ci    __m128i out0, out1, out2;
916cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
917cabdff1aSopenharmony_ci    __m128i filt0, filt1;
918cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filter_vec;
919cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
920cabdff1aSopenharmony_ci    __m128i mask1;
921cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
922cabdff1aSopenharmony_ci    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
923cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
924cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
925cabdff1aSopenharmony_ci    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
926cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
927cabdff1aSopenharmony_ci    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
928cabdff1aSopenharmony_ci    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
929cabdff1aSopenharmony_ci    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
930cabdff1aSopenharmony_ci    __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
931cabdff1aSopenharmony_ci
932cabdff1aSopenharmony_ci    src -= (src_stride + 1);
933cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
934cabdff1aSopenharmony_ci
935cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
936cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
937cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
938cabdff1aSopenharmony_ci
939cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
940cabdff1aSopenharmony_ci
941cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
942cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,src,
943cabdff1aSopenharmony_ci              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
944cabdff1aSopenharmony_ci    src += src_stride_4x;
945cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,src,
946cabdff1aSopenharmony_ci              src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
947cabdff1aSopenharmony_ci
948cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
949cabdff1aSopenharmony_ci              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
950cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
951cabdff1aSopenharmony_ci              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
952cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
953cabdff1aSopenharmony_ci              mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
954cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
955cabdff1aSopenharmony_ci              mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
956cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
957cabdff1aSopenharmony_ci
958cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
959cabdff1aSopenharmony_ci              filt0, dst0, dst1, dst2, dst3);
960cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
961cabdff1aSopenharmony_ci              filt0, dst4, dst5, dst6, dst7);
962cabdff1aSopenharmony_ci    dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
963cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
964cabdff1aSopenharmony_ci              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
965cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
966cabdff1aSopenharmony_ci              vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
967cabdff1aSopenharmony_ci    dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
968cabdff1aSopenharmony_ci
969cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
970cabdff1aSopenharmony_ci              dst10_r, dst21_r, dst32_r, dst43_r);
971cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
972cabdff1aSopenharmony_ci              dst10_l, dst21_l, dst32_l, dst43_l);
973cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
974cabdff1aSopenharmony_ci              dst54_r, dst65_r, dst76_r, dst87_r);
975cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
976cabdff1aSopenharmony_ci              dst54_l, dst65_l, dst76_l, dst87_l);
977cabdff1aSopenharmony_ci
978cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
979cabdff1aSopenharmony_ci              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
980cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
981cabdff1aSopenharmony_ci              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
982cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
983cabdff1aSopenharmony_ci              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
984cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
985cabdff1aSopenharmony_ci              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
986cabdff1aSopenharmony_ci              dst0_r, dst0_l, dst1_r, dst1_l);
987cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
988cabdff1aSopenharmony_ci              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
989cabdff1aSopenharmony_ci              dst2_r, dst2_l, dst3_r, dst3_l);
990cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
991cabdff1aSopenharmony_ci              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
992cabdff1aSopenharmony_ci              dst4_r, dst4_l, dst5_r, dst5_l);
993cabdff1aSopenharmony_ci
994cabdff1aSopenharmony_ci    DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
995cabdff1aSopenharmony_ci              dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
996cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
997cabdff1aSopenharmony_ci              out4_r, out5_r);
998cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
999cabdff1aSopenharmony_ci              out0, out1);
1000cabdff1aSopenharmony_ci    out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
1001cabdff1aSopenharmony_ci
1002cabdff1aSopenharmony_ci    __lsx_vstelm_d(out0, dst, 0, 0);
1003cabdff1aSopenharmony_ci    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1004cabdff1aSopenharmony_ci    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1005cabdff1aSopenharmony_ci    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1006cabdff1aSopenharmony_ci    dst += dst_stride_4x;
1007cabdff1aSopenharmony_ci    __lsx_vstelm_d(out2, dst, 0, 0);
1008cabdff1aSopenharmony_ci    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
1009cabdff1aSopenharmony_ci}
1010cabdff1aSopenharmony_ci
1011cabdff1aSopenharmony_cistatic av_always_inline
1012cabdff1aSopenharmony_civoid hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
1013cabdff1aSopenharmony_ci                                int32_t dst_stride, const int8_t *filter_x,
1014cabdff1aSopenharmony_ci                                const int8_t *filter_y, int32_t height,
1015cabdff1aSopenharmony_ci                                int32_t width8mult)
1016cabdff1aSopenharmony_ci{
1017cabdff1aSopenharmony_ci    uint32_t loop_cnt, cnt;
1018cabdff1aSopenharmony_ci    uint8_t *src_tmp;
1019cabdff1aSopenharmony_ci    uint8_t *dst_tmp;
1020cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
1021cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
1022cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
1023cabdff1aSopenharmony_ci    const int32_t dst_stride_4x = (dst_stride << 2);
1024cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
1025cabdff1aSopenharmony_ci    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1026cabdff1aSopenharmony_ci
1027cabdff1aSopenharmony_ci    __m128i out0, out1;
1028cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6;
1029cabdff1aSopenharmony_ci    __m128i filt0, filt1;
1030cabdff1aSopenharmony_ci    __m128i filt_h0, filt_h1, filter_vec;
1031cabdff1aSopenharmony_ci    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1032cabdff1aSopenharmony_ci    __m128i mask1;
1033cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1034cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1035cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1036cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1037cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1038cabdff1aSopenharmony_ci    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
1039cabdff1aSopenharmony_ci    __m128i out0_r, out1_r, out2_r, out3_r;
1040cabdff1aSopenharmony_ci
1041cabdff1aSopenharmony_ci    src -= (src_stride + 1);
1042cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1043cabdff1aSopenharmony_ci
1044cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1045cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1046cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1047cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
1048cabdff1aSopenharmony_ci
1049cabdff1aSopenharmony_ci    for (cnt = width8mult; cnt--;) {
1050cabdff1aSopenharmony_ci        src_tmp = src;
1051cabdff1aSopenharmony_ci        dst_tmp = dst;
1052cabdff1aSopenharmony_ci
1053cabdff1aSopenharmony_ci        src0 = __lsx_vld(src_tmp, 0);
1054cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1055cabdff1aSopenharmony_ci                  src1, src2);
1056cabdff1aSopenharmony_ci        src_tmp += src_stride_3x;
1057cabdff1aSopenharmony_ci
1058cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
1059cabdff1aSopenharmony_ci                  vec0, vec1);
1060cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
1061cabdff1aSopenharmony_ci                  vec2, vec3);
1062cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
1063cabdff1aSopenharmony_ci                  vec4, vec5);
1064cabdff1aSopenharmony_ci
1065cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1066cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1067cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1068cabdff1aSopenharmony_ci                  dst0, dst1);
1069cabdff1aSopenharmony_ci        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
1070cabdff1aSopenharmony_ci
1071cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1072cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1073cabdff1aSopenharmony_ci
1074cabdff1aSopenharmony_ci        for (loop_cnt = (height >> 2); loop_cnt--;) {
1075cabdff1aSopenharmony_ci            src3 = __lsx_vld(src_tmp, 0);
1076cabdff1aSopenharmony_ci            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1077cabdff1aSopenharmony_ci                      src4, src5);
1078cabdff1aSopenharmony_ci            src6 = __lsx_vldx(src_tmp, src_stride_3x);
1079cabdff1aSopenharmony_ci            src_tmp += src_stride_4x;
1080cabdff1aSopenharmony_ci
1081cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1082cabdff1aSopenharmony_ci                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1083cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1084cabdff1aSopenharmony_ci                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1085cabdff1aSopenharmony_ci
1086cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1087cabdff1aSopenharmony_ci                      vec6, filt0, dst3, dst4, dst5, dst6);
1088cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
1089cabdff1aSopenharmony_ci                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
1090cabdff1aSopenharmony_ci                      dst3, dst4, dst5, dst6);
1091cabdff1aSopenharmony_ci
1092cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
1093cabdff1aSopenharmony_ci                      dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
1094cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
1095cabdff1aSopenharmony_ci                      dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
1096cabdff1aSopenharmony_ci
1097cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1098cabdff1aSopenharmony_ci                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1099cabdff1aSopenharmony_ci            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1100cabdff1aSopenharmony_ci                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1101cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1102cabdff1aSopenharmony_ci                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
1103cabdff1aSopenharmony_ci                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
1104cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
1105cabdff1aSopenharmony_ci                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
1106cabdff1aSopenharmony_ci                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
1107cabdff1aSopenharmony_ci
1108cabdff1aSopenharmony_ci            DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
1109cabdff1aSopenharmony_ci                      dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
1110cabdff1aSopenharmony_ci                      out2_r, out3_r);
1111cabdff1aSopenharmony_ci            DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
1112cabdff1aSopenharmony_ci                      6, out0, out1);
1113cabdff1aSopenharmony_ci            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1114cabdff1aSopenharmony_ci            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1115cabdff1aSopenharmony_ci            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1116cabdff1aSopenharmony_ci            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1117cabdff1aSopenharmony_ci            dst_tmp += dst_stride_4x;
1118cabdff1aSopenharmony_ci
1119cabdff1aSopenharmony_ci            dst10_r = dst54_r;
1120cabdff1aSopenharmony_ci            dst10_l = dst54_l;
1121cabdff1aSopenharmony_ci            dst21_r = dst65_r;
1122cabdff1aSopenharmony_ci            dst21_l = dst65_l;
1123cabdff1aSopenharmony_ci            dst2 = dst6;
1124cabdff1aSopenharmony_ci        }
1125cabdff1aSopenharmony_ci        src += 8;
1126cabdff1aSopenharmony_ci        dst += 8;
1127cabdff1aSopenharmony_ci    }
1128cabdff1aSopenharmony_ci}
1129cabdff1aSopenharmony_ci
/*
 * 8-pixel-wide 4-tap hv filter: dispatch on block height.
 *
 * Heights 2, 4 and 6 use dedicated kernels; any other height that is a
 * multiple of 4 uses the generic 8xN kernel.  Other heights are not handled
 * and nothing is written for them.
 */
static
void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                       int32_t dst_stride, const int8_t *filter_x,
                       const int8_t *filter_y, int32_t height)
{
    switch (height) {
    case 2:
        hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
        break;
    case 4:
        /* Final argument: presumably the count of 8-pixel columns (one). */
        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 1);
        break;
    case 6:
        hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
        break;
    default:
        if (0 == (height & 0x03)) {
            hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 1);
        }
        break;
    }
}
1147cabdff1aSopenharmony_ci
1148cabdff1aSopenharmony_cistatic av_always_inline
1149cabdff1aSopenharmony_civoid hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
1150cabdff1aSopenharmony_ci                        int32_t dst_stride, const int8_t *filter_x,
1151cabdff1aSopenharmony_ci                        const int8_t *filter_y, int32_t height)
1152cabdff1aSopenharmony_ci{
1153cabdff1aSopenharmony_ci    uint32_t loop_cnt;
1154cabdff1aSopenharmony_ci    uint8_t *src_tmp, *dst_tmp;
1155cabdff1aSopenharmony_ci    const int32_t src_stride_2x = (src_stride << 1);
1156cabdff1aSopenharmony_ci    const int32_t dst_stride_2x = (dst_stride << 1);
1157cabdff1aSopenharmony_ci    const int32_t src_stride_4x = (src_stride << 2);
1158cabdff1aSopenharmony_ci    const int32_t dst_stride_4x = (dst_stride << 2);
1159cabdff1aSopenharmony_ci    const int32_t src_stride_3x = src_stride_2x + src_stride;
1160cabdff1aSopenharmony_ci    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1161cabdff1aSopenharmony_ci    __m128i out0, out1;
1162cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1163cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1164cabdff1aSopenharmony_ci    __m128i mask0, mask1, mask2, mask3;
1165cabdff1aSopenharmony_ci    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
1166cabdff1aSopenharmony_ci    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
1167cabdff1aSopenharmony_ci    __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
1168cabdff1aSopenharmony_ci    __m128i dst76_r, dst98_r, dst87_r, dst109_r;
1169cabdff1aSopenharmony_ci    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
1170cabdff1aSopenharmony_ci    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
1171cabdff1aSopenharmony_ci    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1172cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1173cabdff1aSopenharmony_ci
1174cabdff1aSopenharmony_ci    src -= (src_stride + 1);
1175cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1176cabdff1aSopenharmony_ci
1177cabdff1aSopenharmony_ci    filter_vec = __lsx_vld(filter_y, 0);
1178cabdff1aSopenharmony_ci    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1179cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1180cabdff1aSopenharmony_ci
1181cabdff1aSopenharmony_ci    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1182cabdff1aSopenharmony_ci    mask1 = __lsx_vaddi_bu(mask0, 2);
1183cabdff1aSopenharmony_ci
1184cabdff1aSopenharmony_ci    src_tmp = src;
1185cabdff1aSopenharmony_ci    dst_tmp = dst;
1186cabdff1aSopenharmony_ci
1187cabdff1aSopenharmony_ci    src0 = __lsx_vld(src_tmp, 0);
1188cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1189cabdff1aSopenharmony_ci              src1, src2);
1190cabdff1aSopenharmony_ci    src_tmp += src_stride_3x;
1191cabdff1aSopenharmony_ci
1192cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1193cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1194cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
1197cabdff1aSopenharmony_ci    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1198cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
1199cabdff1aSopenharmony_ci              dsth0, dsth1);
1200cabdff1aSopenharmony_ci    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
1201cabdff1aSopenharmony_ci
1202cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
1203cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
1204cabdff1aSopenharmony_ci
1205cabdff1aSopenharmony_ci    for (loop_cnt = 4; loop_cnt--;) {
1206cabdff1aSopenharmony_ci        src3 = __lsx_vld(src_tmp, 0);
1207cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1208cabdff1aSopenharmony_ci                  src4, src5);
1209cabdff1aSopenharmony_ci        src6 = __lsx_vldx(src_tmp, src_stride_3x);
1210cabdff1aSopenharmony_ci        src_tmp += src_stride_4x;
1211cabdff1aSopenharmony_ci
1212cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1213cabdff1aSopenharmony_ci                  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1214cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1215cabdff1aSopenharmony_ci                  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1216cabdff1aSopenharmony_ci
1217cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1218cabdff1aSopenharmony_ci                  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
1219cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
1220cabdff1aSopenharmony_ci                  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
1221cabdff1aSopenharmony_ci                  dsth3, dsth4, dsth5, dsth6);
1222cabdff1aSopenharmony_ci
1223cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1224cabdff1aSopenharmony_ci                  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
1225cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1226cabdff1aSopenharmony_ci                  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
1227cabdff1aSopenharmony_ci
1228cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1229cabdff1aSopenharmony_ci                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1230cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1231cabdff1aSopenharmony_ci                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1232cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1233cabdff1aSopenharmony_ci                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1234cabdff1aSopenharmony_ci                  dst0_r, dst0_l, dst1_r, dst1_l);
1235cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1236cabdff1aSopenharmony_ci                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1237cabdff1aSopenharmony_ci                  dst2_r, dst2_l, dst3_r, dst3_l);
1238cabdff1aSopenharmony_ci
1239cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
1240cabdff1aSopenharmony_ci                  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
1241cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
1242cabdff1aSopenharmony_ci
1243cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1244cabdff1aSopenharmony_ci        __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1245cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1246cabdff1aSopenharmony_ci        __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1247cabdff1aSopenharmony_ci        dst_tmp += dst_stride_4x;
1248cabdff1aSopenharmony_ci
1249cabdff1aSopenharmony_ci        dst10_r = dst54_r;
1250cabdff1aSopenharmony_ci        dst10_l = dst54_l;
1251cabdff1aSopenharmony_ci        dst21_r = dst65_r;
1252cabdff1aSopenharmony_ci        dst21_l = dst65_l;
1253cabdff1aSopenharmony_ci        dsth2 = dsth6;
1254cabdff1aSopenharmony_ci    }
1255cabdff1aSopenharmony_ci
1256cabdff1aSopenharmony_ci    src += 8;
1257cabdff1aSopenharmony_ci    dst += 8;
1258cabdff1aSopenharmony_ci
1259cabdff1aSopenharmony_ci    mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
1260cabdff1aSopenharmony_ci    mask3 = __lsx_vaddi_bu(mask2, 2);
1261cabdff1aSopenharmony_ci
1262cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
1263cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1264cabdff1aSopenharmony_ci    src += src_stride_3x;
1265cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
1266cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
1267cabdff1aSopenharmony_ci
1268cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
1269cabdff1aSopenharmony_ci    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
1270cabdff1aSopenharmony_ci              dst10, dst21);
1271cabdff1aSopenharmony_ci
1272cabdff1aSopenharmony_ci    dst10_r = __lsx_vilvl_h(dst21, dst10);
1273cabdff1aSopenharmony_ci    dst21_r = __lsx_vilvh_h(dst21, dst10);
1274cabdff1aSopenharmony_ci    dst22 = __lsx_vreplvei_d(dst21, 1);
1275cabdff1aSopenharmony_ci
1276cabdff1aSopenharmony_ci    for (loop_cnt = 2; loop_cnt--;) {
1277cabdff1aSopenharmony_ci        src3 = __lsx_vld(src, 0);
1278cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
1279cabdff1aSopenharmony_ci        src6 = __lsx_vldx(src, src_stride_3x);
1280cabdff1aSopenharmony_ci        src += src_stride_4x;
1281cabdff1aSopenharmony_ci        src7 = __lsx_vld(src, 0);
1282cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1283cabdff1aSopenharmony_ci        src10 = __lsx_vldx(src, src_stride_3x);
1284cabdff1aSopenharmony_ci        src += src_stride_4x;
1285cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
1286cabdff1aSopenharmony_ci                  src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
1287cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
1288cabdff1aSopenharmony_ci                  src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
1289cabdff1aSopenharmony_ci
1290cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1291cabdff1aSopenharmony_ci                  vec6, filt0, dst73, dst84, dst95, dst106);
1292cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
1293cabdff1aSopenharmony_ci                  filt1, dst95, vec5, filt1, dst106, vec7, filt1,
1294cabdff1aSopenharmony_ci                  dst73, dst84, dst95, dst106);
1295cabdff1aSopenharmony_ci
1296cabdff1aSopenharmony_ci        dst32_r = __lsx_vilvl_h(dst73, dst22);
1297cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
1298cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
1299cabdff1aSopenharmony_ci        dst65_r = __lsx_vilvl_h(dst106, dst95);
1300cabdff1aSopenharmony_ci        dst109_r = __lsx_vilvh_h(dst106, dst95);
1301cabdff1aSopenharmony_ci        dst22 = __lsx_vreplvei_d(dst73, 1);
1302cabdff1aSopenharmony_ci        dst76_r = __lsx_vilvl_h(dst22, dst106);
1303cabdff1aSopenharmony_ci
1304cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
1305cabdff1aSopenharmony_ci                  filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
1306cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
1307cabdff1aSopenharmony_ci                  filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
1308cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
1309cabdff1aSopenharmony_ci                  filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
1310cabdff1aSopenharmony_ci                  dst0, dst1, dst2, dst3);
1311cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
1312cabdff1aSopenharmony_ci                  filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
1313cabdff1aSopenharmony_ci                  dst4, dst5, dst6, dst7);
1314cabdff1aSopenharmony_ci
1315cabdff1aSopenharmony_ci        DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
1316cabdff1aSopenharmony_ci                  6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
1317cabdff1aSopenharmony_ci        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
1318cabdff1aSopenharmony_ci
1319cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst, 0, 0);
1320cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
1321cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
1322cabdff1aSopenharmony_ci        __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
1323cabdff1aSopenharmony_ci        dst += dst_stride_4x;
1324cabdff1aSopenharmony_ci        __lsx_vstelm_w(out1, dst, 0, 0);
1325cabdff1aSopenharmony_ci        __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
1326cabdff1aSopenharmony_ci        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
1327cabdff1aSopenharmony_ci        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
1328cabdff1aSopenharmony_ci        dst += dst_stride_4x;
1329cabdff1aSopenharmony_ci
1330cabdff1aSopenharmony_ci        dst10_r = dst98_r;
1331cabdff1aSopenharmony_ci        dst21_r = dst109_r;
1332cabdff1aSopenharmony_ci        dst22 = __lsx_vreplvei_d(dst106, 1);
1333cabdff1aSopenharmony_ci    }
1334cabdff1aSopenharmony_ci}
1335cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal+vertical filter wrapper for 16-pixel-wide blocks.
 *
 * A height of exactly 4 is routed to hevc_hv_4t_8multx4_lsx (which takes no
 * height argument); every other height is routed to
 * hevc_hv_4t_8multx4mult_lsx together with the height. Both helpers receive
 * 2 as their final argument -- presumably the number of 8-pixel column
 * groups (16 / 8); confirm against the helper definitions.
 */
static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    if (height != 4) {
        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);
        return;
    }

    hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, 2);
}
1348cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal+vertical filter wrapper for 24-pixel-wide blocks.
 *
 * Forwards every argument unchanged to hevc_hv_4t_8multx4mult_lsx and
 * appends 3 as the final argument -- presumably the count of 8-pixel column
 * groups (24 / 8); confirm against the helper definition.
 */
static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t groups_of_8 = 24 / 8;

    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, groups_of_8);
}
1356cabdff1aSopenharmony_ci
/*
 * 4-tap horizontal+vertical filter wrapper for 32-pixel-wide blocks.
 *
 * Forwards every argument unchanged to hevc_hv_4t_8multx4mult_lsx and
 * appends 4 as the final argument -- presumably the count of 8-pixel column
 * groups (32 / 8); confirm against the helper definition.
 */
static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t groups_of_8 = 32 / 8;

    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, groups_of_8);
}
1364cabdff1aSopenharmony_ci
1365cabdff1aSopenharmony_ci#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
1366cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,           \
1367cabdff1aSopenharmony_ci                                                       ptrdiff_t dst_stride,   \
1368cabdff1aSopenharmony_ci                                                       uint8_t *src,           \
1369cabdff1aSopenharmony_ci                                                       ptrdiff_t src_stride,   \
1370cabdff1aSopenharmony_ci                                                       int height,             \
1371cabdff1aSopenharmony_ci                                                       intptr_t mx,            \
1372cabdff1aSopenharmony_ci                                                       intptr_t my,            \
1373cabdff1aSopenharmony_ci                                                       int width)              \
1374cabdff1aSopenharmony_ci{                                                                              \
1375cabdff1aSopenharmony_ci    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
1376cabdff1aSopenharmony_ci                                                                               \
1377cabdff1aSopenharmony_ci    common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride,  \
1378cabdff1aSopenharmony_ci                                            filter, height);                   \
1379cabdff1aSopenharmony_ci}
1380cabdff1aSopenharmony_ci
1381cabdff1aSopenharmony_ciUNI_MC(qpel, h, 64, 8, hz, mx);
1382cabdff1aSopenharmony_ci
1383cabdff1aSopenharmony_ciUNI_MC(qpel, v, 24, 8, vt, my);
1384cabdff1aSopenharmony_ciUNI_MC(qpel, v, 32, 8, vt, my);
1385cabdff1aSopenharmony_ciUNI_MC(qpel, v, 48, 8, vt, my);
1386cabdff1aSopenharmony_ciUNI_MC(qpel, v, 64, 8, vt, my);
1387cabdff1aSopenharmony_ci
1388cabdff1aSopenharmony_ciUNI_MC(epel, v, 24, 4, vt, my);
1389cabdff1aSopenharmony_ciUNI_MC(epel, v, 32, 4, vt, my);
1390cabdff1aSopenharmony_ci
1391cabdff1aSopenharmony_ci#undef UNI_MC
1392cabdff1aSopenharmony_ci
1393cabdff1aSopenharmony_ci#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
1394cabdff1aSopenharmony_civoid ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,          \
1395cabdff1aSopenharmony_ci                                                    ptrdiff_t dst_stride,  \
1396cabdff1aSopenharmony_ci                                                    uint8_t *src,          \
1397cabdff1aSopenharmony_ci                                                    ptrdiff_t src_stride,  \
1398cabdff1aSopenharmony_ci                                                    int height,            \
1399cabdff1aSopenharmony_ci                                                    intptr_t mx,           \
1400cabdff1aSopenharmony_ci                                                    intptr_t my,           \
1401cabdff1aSopenharmony_ci                                                    int width)             \
1402cabdff1aSopenharmony_ci{                                                                          \
1403cabdff1aSopenharmony_ci    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
1404cabdff1aSopenharmony_ci    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
1405cabdff1aSopenharmony_ci                                                                           \
1406cabdff1aSopenharmony_ci    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride,  \
1407cabdff1aSopenharmony_ci                                    filter_x, filter_y, height);       \
1408cabdff1aSopenharmony_ci}
1409cabdff1aSopenharmony_ci
1410cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 8, 8);
1411cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 16, 8);
1412cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 24, 8);
1413cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 32, 8);
1414cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 48, 8);
1415cabdff1aSopenharmony_ciUNI_MC_HV(qpel, 64, 8);
1416cabdff1aSopenharmony_ci
1417cabdff1aSopenharmony_ciUNI_MC_HV(epel, 8, 4);
1418cabdff1aSopenharmony_ciUNI_MC_HV(epel, 12, 4);
1419cabdff1aSopenharmony_ciUNI_MC_HV(epel, 16, 4);
1420cabdff1aSopenharmony_ciUNI_MC_HV(epel, 24, 4);
1421cabdff1aSopenharmony_ciUNI_MC_HV(epel, 32, 4);
1422cabdff1aSopenharmony_ci
1423cabdff1aSopenharmony_ci#undef UNI_MC_HV
1424