/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h"
23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
24cabdff1aSopenharmony_ci#include "vp9dsp_loongarch.h"
25cabdff1aSopenharmony_ci
/*
 * Store eight 16-byte vectors into eight consecutive rows starting at _dst.
 *
 * _dst0 .. _dst7  row vectors, top to bottom
 * _dst            row pointer; must be a modifiable lvalue (see below)
 * _stride         byte offset of row 1 from the current row pointer
 * _stride2        byte offset of row 2
 * _stride3        byte offset of row 3
 * _stride4        byte distance between the two groups of four rows
 *
 * Side effect: _dst is advanced by _stride4 exactly once, so on exit it
 * points at the fifth row that was written, not below the block. Callers
 * that store another eight rows add one more _stride4 themselves.
 *
 * The expansion is a bare brace block, not do { } while (0): do not use it
 * as the unbraced body of an if/else.
 */
#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4,   \
                 _dst5, _dst6, _dst7, _dst, _stride,  \
                 _stride2, _stride3, _stride4)        \
{                                                     \
    __lsx_vst(_dst0, _dst, 0);                        \
    __lsx_vstx(_dst1, _dst, _stride);                 \
    __lsx_vstx(_dst2, _dst, _stride2);                \
    __lsx_vstx(_dst3, _dst, _stride3);                \
    _dst += _stride4;                                 \
    __lsx_vst(_dst4, _dst, 0);                        \
    __lsx_vstx(_dst5, _dst, _stride);                 \
    __lsx_vstx(_dst6, _dst, _stride2);                \
    __lsx_vstx(_dst7, _dst, _stride3);                \
}
40cabdff1aSopenharmony_ci
/*
 * Store eight rows of 32 bytes each. Row i receives the vector _dsti twice,
 * at byte offsets 0 and 16, so both halves of a row are identical.
 *
 * _dst0 .. _dst7  row vectors, top to bottom
 * _dst            row pointer; must be a modifiable lvalue
 * _stride         byte distance between consecutive rows
 *
 * Side effect: _dst is advanced by _stride after every row, so on exit it
 * points eight rows below where it started and the macro can be invoked
 * back to back to fill taller blocks.
 *
 * The expansion is a bare brace block, not do { } while (0): do not use it
 * as the unbraced body of an if/else.
 */
#define LSX_ST_8X16(_dst0, _dst1, _dst2, _dst3, _dst4,   \
                    _dst5, _dst6, _dst7, _dst, _stride)  \
{                                                        \
    __lsx_vst(_dst0, _dst, 0);                           \
    __lsx_vst(_dst0, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst1, _dst, 0);                           \
    __lsx_vst(_dst1, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst2, _dst, 0);                           \
    __lsx_vst(_dst2, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst3, _dst, 0);                           \
    __lsx_vst(_dst3, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst4, _dst, 0);                           \
    __lsx_vst(_dst4, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst5, _dst, 0);                           \
    __lsx_vst(_dst5, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst6, _dst, 0);                           \
    __lsx_vst(_dst6, _dst, 16);                          \
    _dst += _stride;                                     \
    __lsx_vst(_dst7, _dst, 0);                           \
    __lsx_vst(_dst7, _dst, 16);                          \
    _dst += _stride;                                     \
}
69cabdff1aSopenharmony_ci
70cabdff1aSopenharmony_civoid ff_vert_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
71cabdff1aSopenharmony_ci                       const uint8_t *src)
72cabdff1aSopenharmony_ci{
73cabdff1aSopenharmony_ci    __m128i src0;
74cabdff1aSopenharmony_ci    ptrdiff_t stride2 = dst_stride << 1;
75cabdff1aSopenharmony_ci    ptrdiff_t stride3 = stride2 + dst_stride;
76cabdff1aSopenharmony_ci    ptrdiff_t stride4 = stride2 << 1;
77cabdff1aSopenharmony_ci    src0 = __lsx_vld(src, 0);
78cabdff1aSopenharmony_ci    LSX_ST_8(src0, src0, src0, src0, src0, src0, src0, src0, dst,
79cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
80cabdff1aSopenharmony_ci    dst += stride4;
81cabdff1aSopenharmony_ci    LSX_ST_8(src0, src0, src0, src0, src0, src0, src0, src0, dst,
82cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
83cabdff1aSopenharmony_ci}
84cabdff1aSopenharmony_ci
85cabdff1aSopenharmony_civoid ff_vert_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
86cabdff1aSopenharmony_ci                       const uint8_t *src)
87cabdff1aSopenharmony_ci{
88cabdff1aSopenharmony_ci    uint32_t row;
89cabdff1aSopenharmony_ci    __m128i src0, src1;
90cabdff1aSopenharmony_ci
91cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
92cabdff1aSopenharmony_ci    for (row = 32; row--;) {
93cabdff1aSopenharmony_ci        __lsx_vst(src0, dst, 0);
94cabdff1aSopenharmony_ci        __lsx_vst(src1, dst, 16);
95cabdff1aSopenharmony_ci        dst += dst_stride;
96cabdff1aSopenharmony_ci    }
97cabdff1aSopenharmony_ci}
98cabdff1aSopenharmony_ci
99cabdff1aSopenharmony_civoid ff_hor_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
100cabdff1aSopenharmony_ci                      const uint8_t *top)
101cabdff1aSopenharmony_ci{
102cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
103cabdff1aSopenharmony_ci    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
104cabdff1aSopenharmony_ci    ptrdiff_t stride2 = dst_stride << 1;
105cabdff1aSopenharmony_ci    ptrdiff_t stride3 = stride2 + dst_stride;
106cabdff1aSopenharmony_ci    ptrdiff_t stride4 = stride2 << 1;
107cabdff1aSopenharmony_ci
108cabdff1aSopenharmony_ci    src15 = __lsx_vldrepl_b(src, 0);
109cabdff1aSopenharmony_ci    src14 = __lsx_vldrepl_b(src, 1);
110cabdff1aSopenharmony_ci    src13 = __lsx_vldrepl_b(src, 2);
111cabdff1aSopenharmony_ci    src12 = __lsx_vldrepl_b(src, 3);
112cabdff1aSopenharmony_ci    src11 = __lsx_vldrepl_b(src, 4);
113cabdff1aSopenharmony_ci    src10 = __lsx_vldrepl_b(src, 5);
114cabdff1aSopenharmony_ci    src9  = __lsx_vldrepl_b(src, 6);
115cabdff1aSopenharmony_ci    src8  = __lsx_vldrepl_b(src, 7);
116cabdff1aSopenharmony_ci    src7  = __lsx_vldrepl_b(src, 8);
117cabdff1aSopenharmony_ci    src6  = __lsx_vldrepl_b(src, 9);
118cabdff1aSopenharmony_ci    src5  = __lsx_vldrepl_b(src, 10);
119cabdff1aSopenharmony_ci    src4  = __lsx_vldrepl_b(src, 11);
120cabdff1aSopenharmony_ci    src3  = __lsx_vldrepl_b(src, 12);
121cabdff1aSopenharmony_ci    src2  = __lsx_vldrepl_b(src, 13);
122cabdff1aSopenharmony_ci    src1  = __lsx_vldrepl_b(src, 14);
123cabdff1aSopenharmony_ci    src0  = __lsx_vldrepl_b(src, 15);
124cabdff1aSopenharmony_ci    LSX_ST_8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
125cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
126cabdff1aSopenharmony_ci    dst += stride4;
127cabdff1aSopenharmony_ci    LSX_ST_8(src8, src9, src10, src11, src12, src13, src14, src15, dst,
128cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
129cabdff1aSopenharmony_ci}
130cabdff1aSopenharmony_ci
131cabdff1aSopenharmony_civoid ff_hor_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
132cabdff1aSopenharmony_ci                      const uint8_t *top)
133cabdff1aSopenharmony_ci{
134cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
135cabdff1aSopenharmony_ci    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
136cabdff1aSopenharmony_ci    __m128i src16, src17, src18, src19, src20, src21, src22, src23;
137cabdff1aSopenharmony_ci    __m128i src24, src25, src26, src27, src28, src29, src30, src31;
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci    src31 = __lsx_vldrepl_b(src, 0);
140cabdff1aSopenharmony_ci    src30 = __lsx_vldrepl_b(src, 1);
141cabdff1aSopenharmony_ci    src29 = __lsx_vldrepl_b(src, 2);
142cabdff1aSopenharmony_ci    src28 = __lsx_vldrepl_b(src, 3);
143cabdff1aSopenharmony_ci    src27 = __lsx_vldrepl_b(src, 4);
144cabdff1aSopenharmony_ci    src26 = __lsx_vldrepl_b(src, 5);
145cabdff1aSopenharmony_ci    src25 = __lsx_vldrepl_b(src, 6);
146cabdff1aSopenharmony_ci    src24 = __lsx_vldrepl_b(src, 7);
147cabdff1aSopenharmony_ci    src23 = __lsx_vldrepl_b(src, 8);
148cabdff1aSopenharmony_ci    src22 = __lsx_vldrepl_b(src, 9);
149cabdff1aSopenharmony_ci    src21 = __lsx_vldrepl_b(src, 10);
150cabdff1aSopenharmony_ci    src20 = __lsx_vldrepl_b(src, 11);
151cabdff1aSopenharmony_ci    src19 = __lsx_vldrepl_b(src, 12);
152cabdff1aSopenharmony_ci    src18 = __lsx_vldrepl_b(src, 13);
153cabdff1aSopenharmony_ci    src17 = __lsx_vldrepl_b(src, 14);
154cabdff1aSopenharmony_ci    src16 = __lsx_vldrepl_b(src, 15);
155cabdff1aSopenharmony_ci    src15 = __lsx_vldrepl_b(src, 16);
156cabdff1aSopenharmony_ci    src14 = __lsx_vldrepl_b(src, 17);
157cabdff1aSopenharmony_ci    src13 = __lsx_vldrepl_b(src, 18);
158cabdff1aSopenharmony_ci    src12 = __lsx_vldrepl_b(src, 19);
159cabdff1aSopenharmony_ci    src11 = __lsx_vldrepl_b(src, 20);
160cabdff1aSopenharmony_ci    src10 = __lsx_vldrepl_b(src, 21);
161cabdff1aSopenharmony_ci    src9  = __lsx_vldrepl_b(src, 22);
162cabdff1aSopenharmony_ci    src8  = __lsx_vldrepl_b(src, 23);
163cabdff1aSopenharmony_ci    src7  = __lsx_vldrepl_b(src, 24);
164cabdff1aSopenharmony_ci    src6  = __lsx_vldrepl_b(src, 25);
165cabdff1aSopenharmony_ci    src5  = __lsx_vldrepl_b(src, 26);
166cabdff1aSopenharmony_ci    src4  = __lsx_vldrepl_b(src, 27);
167cabdff1aSopenharmony_ci    src3  = __lsx_vldrepl_b(src, 28);
168cabdff1aSopenharmony_ci    src2  = __lsx_vldrepl_b(src, 29);
169cabdff1aSopenharmony_ci    src1  = __lsx_vldrepl_b(src, 30);
170cabdff1aSopenharmony_ci    src0  = __lsx_vldrepl_b(src, 31);
171cabdff1aSopenharmony_ci    LSX_ST_8X16(src0, src1, src2, src3, src4, src5, src6, src7,
172cabdff1aSopenharmony_ci                dst, dst_stride);
173cabdff1aSopenharmony_ci    LSX_ST_8X16(src8, src9, src10, src11, src12, src13, src14, src15,
174cabdff1aSopenharmony_ci                dst, dst_stride);
175cabdff1aSopenharmony_ci    LSX_ST_8X16(src16, src17, src18, src19, src20, src21, src22, src23,
176cabdff1aSopenharmony_ci                dst, dst_stride);
177cabdff1aSopenharmony_ci    LSX_ST_8X16(src24, src25, src26, src27, src28, src29, src30, src31,
178cabdff1aSopenharmony_ci                dst, dst_stride);
179cabdff1aSopenharmony_ci}
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_civoid ff_dc_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
182cabdff1aSopenharmony_ci                   const uint8_t *src_top)
183cabdff1aSopenharmony_ci{
184cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, dst0;
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_ci    tmp0 = __lsx_vldrepl_w(src_top, 0);
187cabdff1aSopenharmony_ci    tmp1 = __lsx_vldrepl_w(src_left, 0);
188cabdff1aSopenharmony_ci    dst0 = __lsx_vilvl_w(tmp1, tmp0);
189cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_hu_bu(dst0, dst0);
190cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
191cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
192cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 3);
193cabdff1aSopenharmony_ci    dst0 = __lsx_vshuf4i_b(dst0, 0);
194cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);
195cabdff1aSopenharmony_ci    dst += dst_stride;
196cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);
197cabdff1aSopenharmony_ci    dst += dst_stride;
198cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);
199cabdff1aSopenharmony_ci    dst += dst_stride;
200cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);
201cabdff1aSopenharmony_ci}
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci#define INTRA_DC_TL_4X4(dir)                                            \
204cabdff1aSopenharmony_civoid ff_dc_##dir##_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride,          \
205cabdff1aSopenharmony_ci                          const uint8_t *left,                          \
206cabdff1aSopenharmony_ci                          const uint8_t *top)                           \
207cabdff1aSopenharmony_ci{                                                                       \
208cabdff1aSopenharmony_ci    __m128i tmp0, dst0;                                                 \
209cabdff1aSopenharmony_ci                                                                        \
210cabdff1aSopenharmony_ci    tmp0 = __lsx_vldrepl_w(dir, 0);                                     \
211cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0);                              \
212cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                              \
213cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 2);                                     \
214cabdff1aSopenharmony_ci    dst0 = __lsx_vshuf4i_b(dst0, 0);                                    \
215cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
216cabdff1aSopenharmony_ci    dst += dst_stride;                                                  \
217cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
218cabdff1aSopenharmony_ci    dst += dst_stride;                                                  \
219cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
220cabdff1aSopenharmony_ci    dst += dst_stride;                                                  \
221cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);                                    \
222cabdff1aSopenharmony_ci}
223cabdff1aSopenharmony_ciINTRA_DC_TL_4X4(top);
224cabdff1aSopenharmony_ciINTRA_DC_TL_4X4(left);
225cabdff1aSopenharmony_ci
226cabdff1aSopenharmony_civoid ff_dc_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
227cabdff1aSopenharmony_ci                   const uint8_t *src_top)
228cabdff1aSopenharmony_ci{
229cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, dst0;
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci    tmp0 = __lsx_vldrepl_d(src_top, 0);
232cabdff1aSopenharmony_ci    tmp1 = __lsx_vldrepl_d(src_left, 0);
233cabdff1aSopenharmony_ci    dst0 = __lsx_vilvl_d(tmp1, tmp0);
234cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_hu_bu(dst0, dst0);
235cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
236cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
237cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);
238cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 4);
239cabdff1aSopenharmony_ci    dst0 = __lsx_vreplvei_b(dst0, 0);
240cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
241cabdff1aSopenharmony_ci    dst += dst_stride;
242cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
243cabdff1aSopenharmony_ci    dst += dst_stride;
244cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
245cabdff1aSopenharmony_ci    dst += dst_stride;
246cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
247cabdff1aSopenharmony_ci    dst += dst_stride;
248cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
249cabdff1aSopenharmony_ci    dst += dst_stride;
250cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
251cabdff1aSopenharmony_ci    dst += dst_stride;
252cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
253cabdff1aSopenharmony_ci    dst += dst_stride;
254cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);
255cabdff1aSopenharmony_ci}
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_ci#define INTRA_DC_TL_8X8(dir)                                                  \
258cabdff1aSopenharmony_civoid ff_dc_##dir##_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride,                \
259cabdff1aSopenharmony_ci                           const uint8_t *left,                               \
260cabdff1aSopenharmony_ci                           const uint8_t *top)                                \
261cabdff1aSopenharmony_ci{                                                                             \
262cabdff1aSopenharmony_ci    __m128i tmp0, dst0;                                                       \
263cabdff1aSopenharmony_ci                                                                              \
264cabdff1aSopenharmony_ci    tmp0 = __lsx_vldrepl_d(dir, 0);                                           \
265cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0);                                    \
266cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                                    \
267cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);                                    \
268cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 3);                                           \
269cabdff1aSopenharmony_ci    dst0 = __lsx_vreplvei_b(dst0, 0);                                         \
270cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
271cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
272cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
273cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
274cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
275cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
276cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
277cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
278cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
279cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
280cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
281cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
282cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
283cabdff1aSopenharmony_ci    dst += dst_stride;                                                        \
284cabdff1aSopenharmony_ci    __lsx_vstelm_d(dst0, dst, 0, 0);                                          \
285cabdff1aSopenharmony_ci}
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ciINTRA_DC_TL_8X8(top);
288cabdff1aSopenharmony_ciINTRA_DC_TL_8X8(left);
289cabdff1aSopenharmony_ci
290cabdff1aSopenharmony_civoid ff_dc_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
291cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top)
292cabdff1aSopenharmony_ci{
293cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, dst0;
294cabdff1aSopenharmony_ci    ptrdiff_t stride2 = dst_stride << 1;
295cabdff1aSopenharmony_ci    ptrdiff_t stride3 = stride2 + dst_stride;
296cabdff1aSopenharmony_ci    ptrdiff_t stride4 = stride2 << 1;
297cabdff1aSopenharmony_ci
298cabdff1aSopenharmony_ci    tmp0 = __lsx_vld(src_top, 0);
299cabdff1aSopenharmony_ci    tmp1 = __lsx_vld(src_left, 0);
300cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp0, tmp1);
301cabdff1aSopenharmony_ci    dst0 = __lsx_vadd_h(tmp0, tmp1);
302cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
303cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
304cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);
305cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 5);
306cabdff1aSopenharmony_ci    dst0 = __lsx_vreplvei_b(dst0, 0);
307cabdff1aSopenharmony_ci    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,
308cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
309cabdff1aSopenharmony_ci    dst += stride4;
310cabdff1aSopenharmony_ci    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,
311cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
312cabdff1aSopenharmony_ci}
313cabdff1aSopenharmony_ci
314cabdff1aSopenharmony_ci#define INTRA_DC_TL_16X16(dir)                                                \
315cabdff1aSopenharmony_civoid ff_dc_##dir##_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,              \
316cabdff1aSopenharmony_ci                             const uint8_t *left,                             \
317cabdff1aSopenharmony_ci                             const uint8_t *top)                              \
318cabdff1aSopenharmony_ci{                                                                             \
319cabdff1aSopenharmony_ci    __m128i tmp0, dst0;                                                       \
320cabdff1aSopenharmony_ci    ptrdiff_t stride2 = dst_stride << 1;                                      \
321cabdff1aSopenharmony_ci    ptrdiff_t stride3 = stride2 + dst_stride;                                 \
322cabdff1aSopenharmony_ci    ptrdiff_t stride4 = stride2 << 1;                                         \
323cabdff1aSopenharmony_ci                                                                              \
324cabdff1aSopenharmony_ci    tmp0 = __lsx_vld(dir, 0);                                                 \
325cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_hu_bu(tmp0, tmp0);                                    \
326cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                                    \
327cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);                                    \
328cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);                                    \
329cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 4);                                           \
330cabdff1aSopenharmony_ci    dst0 = __lsx_vreplvei_b(dst0, 0);                                         \
331cabdff1aSopenharmony_ci    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,             \
332cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);                          \
333cabdff1aSopenharmony_ci    dst += stride4;                                                           \
334cabdff1aSopenharmony_ci    LSX_ST_8(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst,             \
335cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);                          \
336cabdff1aSopenharmony_ci}
337cabdff1aSopenharmony_ci
338cabdff1aSopenharmony_ciINTRA_DC_TL_16X16(top);
339cabdff1aSopenharmony_ciINTRA_DC_TL_16X16(left);
340cabdff1aSopenharmony_ci
341cabdff1aSopenharmony_civoid ff_dc_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,
342cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top)
343cabdff1aSopenharmony_ci{
344cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, dst0;
345cabdff1aSopenharmony_ci
346cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src_top, 0, src_top, 16, tmp0, tmp1);
347cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src_left, 0, src_left, 16, tmp2, tmp3);
348cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2,
349cabdff1aSopenharmony_ci              tmp3, tmp3, tmp0, tmp1, tmp2, tmp3);
350cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp0, tmp1);
351cabdff1aSopenharmony_ci    dst0 = __lsx_vadd_h(tmp0, tmp1);
352cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);
353cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);
354cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);
355cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 6);
356cabdff1aSopenharmony_ci    dst0 = __lsx_vreplvei_b(dst0, 0);
357cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
358cabdff1aSopenharmony_ci                dst, dst_stride);
359cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
360cabdff1aSopenharmony_ci                dst, dst_stride);
361cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
362cabdff1aSopenharmony_ci                dst, dst_stride);
363cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,
364cabdff1aSopenharmony_ci                dst, dst_stride);
365cabdff1aSopenharmony_ci}
366cabdff1aSopenharmony_ci
367cabdff1aSopenharmony_ci#define INTRA_DC_TL_32X32(dir)                                               \
368cabdff1aSopenharmony_civoid ff_dc_##dir##_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,             \
369cabdff1aSopenharmony_ci                             const uint8_t *left,                            \
370cabdff1aSopenharmony_ci                             const uint8_t *top)                             \
371cabdff1aSopenharmony_ci{                                                                            \
372cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, dst0;                                                \
373cabdff1aSopenharmony_ci                                                                             \
374cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, dir, 0, dir, 16, tmp0, tmp1);                       \
375cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp0, tmp1);       \
376cabdff1aSopenharmony_ci    dst0 = __lsx_vadd_h(tmp0, tmp1);                                         \
377cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_wu_hu(dst0, dst0);                                   \
378cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_du_wu(dst0, dst0);                                   \
379cabdff1aSopenharmony_ci    dst0 = __lsx_vhaddw_qu_du(dst0, dst0);                                   \
380cabdff1aSopenharmony_ci    dst0 = __lsx_vsrari_w(dst0, 5);                                          \
381cabdff1aSopenharmony_ci    dst0 = __lsx_vreplvei_b(dst0, 0);                                        \
382cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
383cabdff1aSopenharmony_ci                dst, dst_stride);                                            \
384cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
385cabdff1aSopenharmony_ci                dst, dst_stride);                                            \
386cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
387cabdff1aSopenharmony_ci                dst, dst_stride);                                            \
388cabdff1aSopenharmony_ci    LSX_ST_8X16(dst0, dst0, dst0, dst0, dst0, dst0, dst0, dst0,              \
389cabdff1aSopenharmony_ci                dst, dst_stride);                                            \
390cabdff1aSopenharmony_ci}
391cabdff1aSopenharmony_ci
392cabdff1aSopenharmony_ciINTRA_DC_TL_32X32(top);
393cabdff1aSopenharmony_ciINTRA_DC_TL_32X32(left);
394cabdff1aSopenharmony_ci
395cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_16X16_LSX(val)                             \
396cabdff1aSopenharmony_civoid ff_dc_##val##_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,       \
397cabdff1aSopenharmony_ci                             const uint8_t *left, const uint8_t *top)  \
398cabdff1aSopenharmony_ci{                                                                      \
399cabdff1aSopenharmony_ci    __m128i out = __lsx_vldi(val);                                     \
400cabdff1aSopenharmony_ci    ptrdiff_t stride2 = dst_stride << 1;                               \
401cabdff1aSopenharmony_ci    ptrdiff_t stride3 = stride2 + dst_stride;                          \
402cabdff1aSopenharmony_ci    ptrdiff_t stride4 = stride2 << 1;                                  \
403cabdff1aSopenharmony_ci                                                                       \
404cabdff1aSopenharmony_ci    LSX_ST_8(out, out, out, out, out, out, out, out, dst,              \
405cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);                   \
406cabdff1aSopenharmony_ci    dst += stride4;                                                    \
407cabdff1aSopenharmony_ci    LSX_ST_8(out, out, out, out, out, out, out, out, dst,              \
408cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);                   \
409cabdff1aSopenharmony_ci}
410cabdff1aSopenharmony_ci
411cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_LSX(127);
412cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_LSX(128);
413cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_16X16_LSX(129);
414cabdff1aSopenharmony_ci
415cabdff1aSopenharmony_ci#define INTRA_PREDICT_VALDC_32X32_LSX(val)                               \
416cabdff1aSopenharmony_civoid ff_dc_##val##_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,         \
417cabdff1aSopenharmony_ci                             const uint8_t *left, const uint8_t *top)    \
418cabdff1aSopenharmony_ci{                                                                        \
419cabdff1aSopenharmony_ci    __m128i out = __lsx_vldi(val);                                       \
420cabdff1aSopenharmony_ci                                                                         \
421cabdff1aSopenharmony_ci    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
422cabdff1aSopenharmony_ci    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
423cabdff1aSopenharmony_ci    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
424cabdff1aSopenharmony_ci    LSX_ST_8X16(out, out, out, out, out, out, out, out, dst, dst_stride);\
425cabdff1aSopenharmony_ci}
426cabdff1aSopenharmony_ci
427cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_LSX(127);
428cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_LSX(128);
429cabdff1aSopenharmony_ciINTRA_PREDICT_VALDC_32X32_LSX(129);
430cabdff1aSopenharmony_ci
431cabdff1aSopenharmony_civoid ff_tm_4x4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
432cabdff1aSopenharmony_ci                   const uint8_t *src_left, const uint8_t *src_top_ptr)
433cabdff1aSopenharmony_ci{
434cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
435cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, reg0, reg1;
436cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3;
437cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3;
438cabdff1aSopenharmony_ci
439cabdff1aSopenharmony_ci    reg0 = __lsx_vreplgr2vr_h(top_left);
440cabdff1aSopenharmony_ci    reg1 = __lsx_vld(src_top_ptr, 0);
441cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left,
442cabdff1aSopenharmony_ci              3, tmp3, tmp2, tmp1, tmp0);
443cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, reg1,
444cabdff1aSopenharmony_ci              src0, src1, src2, src3);
445cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vhaddw_hu_bu, src0, src0, src1, src1, src2, src2, src3,
446cabdff1aSopenharmony_ci              src3, dst0, dst1, dst2, dst3);
447cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, dst0, reg0, dst1, reg0, dst2, reg0, dst3, reg0,
448cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
449cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, dst0, 7, dst1, 7, dst2, 7, dst3, 7,
450cabdff1aSopenharmony_ci              dst0, dst1, dst2, dst3);
451cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
452cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 0);
453cabdff1aSopenharmony_ci    dst += dst_stride;
454cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst0, dst, 0, 2);
455cabdff1aSopenharmony_ci    dst += dst_stride;
456cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst1, dst, 0, 0);
457cabdff1aSopenharmony_ci    dst += dst_stride;
458cabdff1aSopenharmony_ci    __lsx_vstelm_w(dst1, dst, 0, 2);
459cabdff1aSopenharmony_ci}
460cabdff1aSopenharmony_ci
461cabdff1aSopenharmony_civoid ff_tm_8x8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
462cabdff1aSopenharmony_ci                   const uint8_t *src_left, const uint8_t *src_top_ptr)
463cabdff1aSopenharmony_ci{
464cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
465cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
466cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
467cabdff1aSopenharmony_ci    __m128i reg0, reg1;
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_ci    reg0 = __lsx_vreplgr2vr_h(top_left);
470cabdff1aSopenharmony_ci    reg1 = __lsx_vld(src_top_ptr, 0);
471cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left,
472cabdff1aSopenharmony_ci              3, tmp7, tmp6, tmp5, tmp4);
473cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 4, src_left, 5, src_left, 6, src_left,
474cabdff1aSopenharmony_ci              7, tmp3, tmp2, tmp1, tmp0);
475cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3, reg1,
476cabdff1aSopenharmony_ci              src0, src1, src2, src3);
477cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7, reg1,
478cabdff1aSopenharmony_ci              src4, src5, src6, src7);
479cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vhaddw_hu_bu, src0, src0, src1, src1, src2, src2, src3,
480cabdff1aSopenharmony_ci              src3, src0, src1, src2, src3);
481cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vhaddw_hu_bu, src4, src4, src5, src5, src6, src6, src7,
482cabdff1aSopenharmony_ci              src7, src4, src5, src6, src7);
483cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
484cabdff1aSopenharmony_ci              src0, src1, src2, src3);
485cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
486cabdff1aSopenharmony_ci              src4, src5, src6, src7);
487cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
488cabdff1aSopenharmony_ci              src0, src1, src2, src3);
489cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
490cabdff1aSopenharmony_ci              src4, src5, src6, src7);
491cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, src5, src4, src7, src6,
492cabdff1aSopenharmony_ci              src0, src1, src2, src3);
493cabdff1aSopenharmony_ci    __lsx_vstelm_d(src0, dst, 0, 0);
494cabdff1aSopenharmony_ci    dst += dst_stride;
495cabdff1aSopenharmony_ci    __lsx_vstelm_d(src0, dst, 0, 1);
496cabdff1aSopenharmony_ci    dst += dst_stride;
497cabdff1aSopenharmony_ci    __lsx_vstelm_d(src1, dst, 0, 0);
498cabdff1aSopenharmony_ci    dst += dst_stride;
499cabdff1aSopenharmony_ci    __lsx_vstelm_d(src1, dst, 0, 1);
500cabdff1aSopenharmony_ci    dst += dst_stride;
501cabdff1aSopenharmony_ci    __lsx_vstelm_d(src2, dst, 0, 0);
502cabdff1aSopenharmony_ci    dst += dst_stride;
503cabdff1aSopenharmony_ci    __lsx_vstelm_d(src2, dst, 0, 1);
504cabdff1aSopenharmony_ci    dst += dst_stride;
505cabdff1aSopenharmony_ci    __lsx_vstelm_d(src3, dst, 0, 0);
506cabdff1aSopenharmony_ci    dst += dst_stride;
507cabdff1aSopenharmony_ci    __lsx_vstelm_d(src3, dst, 0, 1);
508cabdff1aSopenharmony_ci}
509cabdff1aSopenharmony_ci
510cabdff1aSopenharmony_civoid ff_tm_16x16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
511cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top_ptr)
512cabdff1aSopenharmony_ci{
513cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
514cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
515cabdff1aSopenharmony_ci    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
516cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
517cabdff1aSopenharmony_ci    __m128i reg0, reg1;
518cabdff1aSopenharmony_ci    ptrdiff_t stride2 = dst_stride << 1;
519cabdff1aSopenharmony_ci    ptrdiff_t stride3 = stride2 + dst_stride;
520cabdff1aSopenharmony_ci    ptrdiff_t stride4 = stride2 << 1;
521cabdff1aSopenharmony_ci
522cabdff1aSopenharmony_ci    reg0 = __lsx_vreplgr2vr_h(top_left);
523cabdff1aSopenharmony_ci    reg1 = __lsx_vld(src_top_ptr, 0);
524cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2, src_left,
525cabdff1aSopenharmony_ci              3, tmp15, tmp14, tmp13, tmp12);
526cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 4, src_left, 5, src_left, 6, src_left,
527cabdff1aSopenharmony_ci              7, tmp11, tmp10, tmp9, tmp8);
528cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 8, src_left, 9, src_left, 10,
529cabdff1aSopenharmony_ci              src_left, 11, tmp7, tmp6, tmp5, tmp4);
530cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vldrepl_b, src_left, 12, src_left, 13, src_left, 14,
531cabdff1aSopenharmony_ci              src_left, 15, tmp3, tmp2, tmp1, tmp0);
532cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3,
533cabdff1aSopenharmony_ci              reg1, src0, src1, src2, src3);
534cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1, tmp3,
535cabdff1aSopenharmony_ci              reg1, src4, src5, src6, src7);
536cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
537cabdff1aSopenharmony_ci              src0, src1, src2, src3);
538cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
539cabdff1aSopenharmony_ci              src4, src5, src6, src7);
540cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
541cabdff1aSopenharmony_ci              src0, src1, src2, src3);
542cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
543cabdff1aSopenharmony_ci              src4, src5, src6, src7);
544cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
545cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
546cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7,
547cabdff1aSopenharmony_ci              reg1, src0, src1, src2, src3);
548cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp4, reg1, tmp5, reg1, tmp6, reg1, tmp7,
549cabdff1aSopenharmony_ci              reg1, src4, src5, src6, src7);
550cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
551cabdff1aSopenharmony_ci              src0, src1, src2, src3);
552cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
553cabdff1aSopenharmony_ci              src4, src5, src6, src7);
554cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
555cabdff1aSopenharmony_ci              src0, src1, src2, src3);
556cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
557cabdff1aSopenharmony_ci              src4, src5, src6, src7);
558cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
559cabdff1aSopenharmony_ci              tmp4, tmp5, tmp6, tmp7);
560cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp8, reg1, tmp9, reg1, tmp10, reg1, tmp11,
561cabdff1aSopenharmony_ci              reg1, src0, src1, src2, src3);
562cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp8, reg1, tmp9, reg1, tmp10, reg1, tmp11,
563cabdff1aSopenharmony_ci              reg1, src4, src5, src6, src7);
564cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
565cabdff1aSopenharmony_ci              src0, src1, src2, src3);
566cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
567cabdff1aSopenharmony_ci              src4, src5, src6, src7);
568cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
569cabdff1aSopenharmony_ci              src0, src1, src2, src3);
570cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
571cabdff1aSopenharmony_ci              src4, src5, src6, src7);
572cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
573cabdff1aSopenharmony_ci              tmp8, tmp9, tmp10, tmp11);
574cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwev_h_bu, tmp12, reg1, tmp13, reg1, tmp14, reg1,
575cabdff1aSopenharmony_ci              tmp15, reg1, src0, src1, src2, src3);
576cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vaddwod_h_bu, tmp12, reg1, tmp13, reg1, tmp14, reg1,
577cabdff1aSopenharmony_ci              tmp15, reg1, src4, src5, src6, src7);
578cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3, reg0,
579cabdff1aSopenharmony_ci              src0, src1, src2, src3);
580cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7, reg0,
581cabdff1aSopenharmony_ci              src4, src5, src6, src7);
582cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
583cabdff1aSopenharmony_ci              src0, src1, src2, src3);
584cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
585cabdff1aSopenharmony_ci              src4, src5, src6, src7);
586cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7, src3,
587cabdff1aSopenharmony_ci              tmp12, tmp13, tmp14, tmp15);
588cabdff1aSopenharmony_ci    LSX_ST_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, dst,
589cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
590cabdff1aSopenharmony_ci    dst += stride4;
591cabdff1aSopenharmony_ci    LSX_ST_8(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, dst,
592cabdff1aSopenharmony_ci             dst_stride, stride2, stride3, stride4);
593cabdff1aSopenharmony_ci}
594cabdff1aSopenharmony_ci
595cabdff1aSopenharmony_civoid ff_tm_32x32_lsx(uint8_t *dst, ptrdiff_t dst_stride,
596cabdff1aSopenharmony_ci                     const uint8_t *src_left, const uint8_t *src_top_ptr)
597cabdff1aSopenharmony_ci{
598cabdff1aSopenharmony_ci    uint8_t top_left = src_top_ptr[-1];
599cabdff1aSopenharmony_ci    uint32_t loop_cnt;
600cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3, reg0, reg1, reg2;
601cabdff1aSopenharmony_ci    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
602cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
603cabdff1aSopenharmony_ci
604cabdff1aSopenharmony_ci    reg0 = __lsx_vreplgr2vr_h(top_left);
605cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vld, src_top_ptr, 0, src_top_ptr, 16, reg1, reg2);
606cabdff1aSopenharmony_ci
607cabdff1aSopenharmony_ci    src_left += 28;
608cabdff1aSopenharmony_ci    for (loop_cnt = 8; loop_cnt--;) {
609cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vldrepl_b, src_left, 0, src_left, 1, src_left, 2,
610cabdff1aSopenharmony_ci                  src_left, 3, tmp3, tmp2, tmp1, tmp0);
611cabdff1aSopenharmony_ci        src_left -= 4;
612cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1,
613cabdff1aSopenharmony_ci                  tmp3, reg1, src0, src1, src2, src3);
614cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg1, tmp1, reg1, tmp2, reg1,
615cabdff1aSopenharmony_ci                  tmp3, reg1, src4, src5, src6, src7);
616cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vssub_hu, src0, reg0, src1, reg0, src2, reg0, src3,
617cabdff1aSopenharmony_ci                  reg0, src0, src1, src2, src3);
618cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vssub_hu, src4, reg0, src5, reg0, src6, reg0, src7,
619cabdff1aSopenharmony_ci                  reg0, src4, src5, src6, src7);
620cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vaddwev_h_bu, tmp0, reg2, tmp1, reg2, tmp2, reg2,
621cabdff1aSopenharmony_ci                  tmp3, reg2, dst0, dst1, dst2, dst3);
622cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vaddwod_h_bu, tmp0, reg2, tmp1, reg2, tmp2, reg2,
623cabdff1aSopenharmony_ci                  tmp3, reg2, dst4, dst5, dst6, dst7);
624cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vssub_hu, dst0, reg0, dst1, reg0, dst2, reg0, dst3,
625cabdff1aSopenharmony_ci                  reg0, dst0, dst1, dst2, dst3);
626cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vssub_hu, dst4, reg0, dst5, reg0, dst6, reg0, dst7,
627cabdff1aSopenharmony_ci                  reg0, dst4, dst5, dst6, dst7);
628cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsat_hu, src0, 7, src1, 7, src2, 7, src3, 7,
629cabdff1aSopenharmony_ci                  src0, src1, src2, src3);
630cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsat_hu, src4, 7, src5, 7, src6, 7, src7, 7,
631cabdff1aSopenharmony_ci                  src4, src5, src6, src7);
632cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsat_hu, dst0, 7, dst1, 7, dst2, 7, dst3, 7,
633cabdff1aSopenharmony_ci                  dst0, dst1, dst2, dst3);
634cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vsat_hu, dst4, 7, dst5, 7, dst6, 7, dst7, 7,
635cabdff1aSopenharmony_ci                  dst4, dst5, dst6, dst7);
636cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpackev_b, src4, src0, src5, src1, src6, src2, src7,
637cabdff1aSopenharmony_ci                  src3, src0, src1, src2, src3);
638cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpackev_b, dst4, dst0, dst5, dst1, dst6, dst2, dst7,
639cabdff1aSopenharmony_ci                  dst3, dst0, dst1, dst2, dst3);
640cabdff1aSopenharmony_ci        __lsx_vst(src0, dst, 0);
641cabdff1aSopenharmony_ci        __lsx_vst(dst0, dst, 16);
642cabdff1aSopenharmony_ci        dst += dst_stride;
643cabdff1aSopenharmony_ci        __lsx_vst(src1, dst, 0);
644cabdff1aSopenharmony_ci        __lsx_vst(dst1, dst, 16);
645cabdff1aSopenharmony_ci        dst += dst_stride;
646cabdff1aSopenharmony_ci        __lsx_vst(src2, dst, 0);
647cabdff1aSopenharmony_ci        __lsx_vst(dst2, dst, 16);
648cabdff1aSopenharmony_ci        dst += dst_stride;
649cabdff1aSopenharmony_ci        __lsx_vst(src3, dst, 0);
650cabdff1aSopenharmony_ci        __lsx_vst(dst3, dst, 16);
651cabdff1aSopenharmony_ci        dst += dst_stride;
652cabdff1aSopenharmony_ci    }
653cabdff1aSopenharmony_ci}
654