/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Jin Bo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci#include "libavcodec/vp9dsp.h"
23cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
24cabdff1aSopenharmony_ci#include "vp9dsp_loongarch.h"
25cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
26cabdff1aSopenharmony_ci
/* Number of fractional bits in the fixed-point transform constants below;
 * products with those constants are shifted right (with rounding) by this
 * amount. */
#define VP9_DCT_CONST_BITS   14
/* Declaration attribute requesting `align`-byte alignment (GCC/Clang
 * attribute syntax). */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned(align)))
/* Rounding right shift: adds 2^(n-1) to `value`, then shifts right by n.
 * n must be >= 1 because the rounding bias is computed as 1 << (n - 1). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
30cabdff1aSopenharmony_ci
/*
 * Fixed-point cosine table: cospi_k_64 == round(16384 * cos(k * pi / 64)),
 * i.e. the cosine scaled by 2^VP9_DCT_CONST_BITS.
 *
 * NOTE(review): these objects have external linkage and no ff_ prefix. If
 * no other translation unit refers to them, they should be made `static`;
 * confirm there are no extern users before changing the linkage.
 */
const int32_t cospi_1_64 = 16364;
const int32_t cospi_2_64 = 16305;
const int32_t cospi_3_64 = 16207;
const int32_t cospi_4_64 = 16069;
const int32_t cospi_5_64 = 15893;
const int32_t cospi_6_64 = 15679;
const int32_t cospi_7_64 = 15426;
const int32_t cospi_8_64 = 15137;
const int32_t cospi_9_64 = 14811;
const int32_t cospi_10_64 = 14449;
const int32_t cospi_11_64 = 14053;
const int32_t cospi_12_64 = 13623;
const int32_t cospi_13_64 = 13160;
const int32_t cospi_14_64 = 12665;
const int32_t cospi_15_64 = 12140;
const int32_t cospi_16_64 = 11585;
const int32_t cospi_17_64 = 11003;
const int32_t cospi_18_64 = 10394;
const int32_t cospi_19_64 = 9760;
const int32_t cospi_20_64 = 9102;
const int32_t cospi_21_64 = 8423;
const int32_t cospi_22_64 = 7723;
const int32_t cospi_23_64 = 7005;
const int32_t cospi_24_64 = 6270;
const int32_t cospi_25_64 = 5520;
const int32_t cospi_26_64 = 4756;
const int32_t cospi_27_64 = 3981;
const int32_t cospi_28_64 = 3196;
const int32_t cospi_29_64 = 2404;
const int32_t cospi_30_64 = 1606;
const int32_t cospi_31_64 = 804;
62cabdff1aSopenharmony_ci
/*
 * Fixed-point sine table:
 * sinpi_k_9 == round(16384 * (2 * sqrt(2) / 3) * sin(k * pi / 9)).
 *
 * NOTE(review): not referenced in the part of the file reviewed here, and
 * exported with external linkage; consider `static` if nothing else uses
 * these symbols.
 */
const int32_t sinpi_1_9 = 5283;
const int32_t sinpi_2_9 = 9929;
const int32_t sinpi_3_9 = 13377;
const int32_t sinpi_4_9 = 15212;
67cabdff1aSopenharmony_ci
/*
 * Rotate the 16-bit vectors (reg0, reg1) by the constant pair (cnst0, cnst1).
 *
 * Per 16-bit lane (given the usual LSX interleave / dot-product semantics):
 *   out0 = round((reg0 * cnst0 - reg1 * cnst1) >> VP9_DCT_CONST_BITS)
 *   out1 = round((reg1 * cnst0 + reg0 * cnst1) >> VP9_DCT_CONST_BITS)
 * Products are accumulated in 32 bits, shifted with rounding and packed
 * back to 16 bits.
 *
 * All four interleaves of reg0/reg1 are formed before out0 is written, so
 * out0/out1 may be the same variables as reg0/reg1. reg0 and reg1 are
 * expanded several times; pass side-effect-free expressions only.
 */
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)  \
{                                                                  \
    __m128i k0_m = __lsx_vreplgr2vr_h(cnst0);                      \
    __m128i s0_m, s1_m, s2_m, s3_m;                                \
                                                                   \
    /* k0_m: cnst0 in even halfwords, cnst1 in odd halfwords */    \
    s0_m = __lsx_vreplgr2vr_h(cnst1);                              \
    k0_m = __lsx_vpackev_h(s0_m, k0_m);                            \
                                                                   \
    s1_m = __lsx_vilvl_h(__lsx_vneg_h(reg1), reg0);                \
    s0_m = __lsx_vilvh_h(__lsx_vneg_h(reg1), reg0);                \
    s3_m = __lsx_vilvl_h(reg0, reg1);                              \
    s2_m = __lsx_vilvh_h(reg0, reg1);                              \
    DUP2_ARG2(__lsx_vdp2_w_h, s1_m, k0_m, s0_m, k0_m, s1_m, s0_m); \
    DUP2_ARG2(__lsx_vsrari_w, s1_m, VP9_DCT_CONST_BITS,            \
              s0_m, VP9_DCT_CONST_BITS, s1_m, s0_m);               \
    out0 = __lsx_vpickev_h(s0_m, s1_m);                            \
    DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k0_m, s2_m, k0_m, s1_m, s0_m); \
    DUP2_ARG2(__lsx_vsrari_w, s1_m, VP9_DCT_CONST_BITS,            \
              s0_m, VP9_DCT_CONST_BITS, s1_m, s0_m);               \
    out1 = __lsx_vpickev_h(s0_m, s1_m);                            \
}
89cabdff1aSopenharmony_ci
/*
 * Statement expression yielding a vector of 16-bit constants with c0_h
 * replicated into the even halfwords and c1_h into the odd halfwords
 * (assuming the usual __lsx_vpackev_h lane order), ready to be used as the
 * coefficient operand of __lsx_vdp2_w_h.
 */
#define VP9_SET_COSPI_PAIR(c0_h, c1_h)    \
( {                                       \
    __m128i out0_m, r0_m, r1_m;           \
                                          \
    r0_m = __lsx_vreplgr2vr_h(c0_h);      \
    r1_m = __lsx_vreplgr2vr_h(c1_h);      \
    out0_m = __lsx_vpackev_h(r1_m, r0_m); \
                                          \
    out0_m;                               \
} )
100cabdff1aSopenharmony_ci
/*
 * Add four rows of 16-bit residuals (in0..in3) to an 8x4 block of 8-bit
 * pixels at dst and store the result back in place.
 *
 * For each of the four rows (dst, dst + dst_stride, ...): the destination
 * bytes are widened to 16 bits, the residual row is added, the sum is
 * clipped by __lsx_vclip255_h, packed back to bytes and 8 bytes are stored.
 *
 * dst_stride is parenthesized at every use so that an expression argument
 * (e.g. `a + b`) is scaled as a whole.
 *
 * NOTE(review): each row is loaded with __lsx_vld, which presumably reads a
 * full 16-byte vector although only 8 pixels are used; confirm that dst rows
 * are readable 8 bytes past the block.
 */
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)          \
{                                                                         \
    uint8_t *dst_m = (uint8_t *) (dst);                                   \
    __m128i dst0_m, dst1_m, dst2_m, dst3_m;                               \
    __m128i tmp0_m, tmp1_m;                                               \
    __m128i res0_m, res1_m, res2_m, res3_m;                               \
    __m128i zero_m = __lsx_vldi(0);                                       \
    DUP4_ARG2(__lsx_vld, dst_m, 0, dst_m + (dst_stride), 0,               \
              dst_m + 2 * (dst_stride), 0, dst_m + 3 * (dst_stride), 0,   \
              dst0_m, dst1_m, dst2_m, dst3_m);                            \
    DUP4_ARG2(__lsx_vilvl_b, zero_m, dst0_m, zero_m, dst1_m, zero_m,      \
              dst2_m, zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);    \
    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2,        \
              res3_m, in3, res0_m, res1_m, res2_m, res3_m);               \
    DUP4_ARG1(__lsx_vclip255_h, res0_m, res1_m, res2_m, res3_m,           \
              res0_m, res1_m, res2_m, res3_m);                            \
    DUP2_ARG2(__lsx_vpickev_b, res1_m, res0_m, res3_m, res2_m,            \
              tmp0_m, tmp1_m);                                            \
    __lsx_vstelm_d(tmp0_m, dst_m, 0, 0);                                  \
    __lsx_vstelm_d(tmp0_m, dst_m + (dst_stride), 0, 1);                   \
    __lsx_vstelm_d(tmp1_m, dst_m + 2 * (dst_stride), 0, 0);               \
    __lsx_vstelm_d(tmp1_m, dst_m + 3 * (dst_stride), 0, 1);               \
}
124cabdff1aSopenharmony_ci
/*
 * Widen the sixteen bytes of `in` to 16-bit lanes by interleaving with zero:
 * out_l receives the low eight bytes, out_h the high eight bytes
 * (zero-extended, assuming the usual __lsx_vilvl_b/__lsx_vilvh_b lane order).
 *
 * The scratch vector is named zero_m, like the temporaries of the other
 * macros in this file, so it cannot capture a caller variable called `zero`.
 */
#define VP9_UNPCK_UB_SH(in, out_h, out_l) \
{                                         \
    __m128i zero_m = __lsx_vldi(0);       \
    out_l = __lsx_vilvl_b(zero_m, in);    \
    out_h = __lsx_vilvh_b(zero_m, in);    \
}
131cabdff1aSopenharmony_ci
/*
 * Interleave-based transpose of the low four halfwords of in0..in7.
 *
 * Only the low half of each input is read (every first-level interleave is
 * __lsx_vilvl_h). The halfword, word and doubleword interleaves that follow
 * presumably leave column j of that 8x4 tile in out<j> for j = 0..3;
 * out4..out7 are set to zero.
 *
 * All inputs are consumed by the first interleave step before any output is
 * written, so the outputs may be the same variables as the inputs.
 */
#define VP9_ILVLTRANS4x8_H(in0, in1, in2, in3, in4, in5, in6, in7,          \
                           out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                           \
    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
    __m128i tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                 \
    __m128i zero_m = __lsx_vldi(0);                                         \
                                                                            \
    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,        \
              tmp0_n, tmp1_n, tmp2_n, tmp3_n);                              \
    tmp0_m = __lsx_vilvl_w(tmp1_n, tmp0_n);                                 \
    tmp2_m = __lsx_vilvh_w(tmp1_n, tmp0_n);                                 \
    tmp1_m = __lsx_vilvl_w(tmp3_n, tmp2_n);                                 \
    tmp3_m = __lsx_vilvh_w(tmp3_n, tmp2_n);                                 \
                                                                            \
    out0 = __lsx_vilvl_d(tmp1_m, tmp0_m);                                   \
    out1 = __lsx_vilvh_d(tmp1_m, tmp0_m);                                   \
    out2 = __lsx_vilvl_d(tmp3_m, tmp2_m);                                   \
    out3 = __lsx_vilvh_d(tmp3_m, tmp2_m);                                   \
                                                                            \
    out4 = zero_m;                                                          \
    out5 = zero_m;                                                          \
    out6 = zero_m;                                                          \
    out7 = zero_m;                                                          \
}
156cabdff1aSopenharmony_ci
/*
 * Multiply-and-add macro.
 *
 * cst0..cst3 are coefficient pairs as built by VP9_SET_COSPI_PAIR /
 * VP9_SET_CONST_PAIR (first constant in even halfwords, second in odd).
 * Per 16-bit lane, assuming the usual LSX interleave / dot-product lane
 * order, with a rounding shift by VP9_DCT_CONST_BITS:
 *   out0 = inp0 * cst0.even + inp1 * cst0.odd
 *   out1 = inp0 * cst1.even + inp1 * cst1.odd
 *   out2 = inp2 * cst2.even + inp3 * cst2.odd
 *   out3 = inp2 * cst3.even + inp3 * cst3.odd
 *
 * The four inputs are interleaved into temporaries before any output is
 * written, so the outputs may be the same variables as the inputs.
 */
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,            \
                 out0, out1, out2, out3)                                    \
{                                                                           \
    __m128i madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                            \
    madd_s1_m = __lsx_vilvl_h(inp1, inp0);                                  \
    madd_s0_m = __lsx_vilvh_h(inp1, inp0);                                  \
    madd_s3_m = __lsx_vilvl_h(inp3, inp2);                                  \
    madd_s2_m = __lsx_vilvh_h(inp3, inp2);                                  \
    DUP4_ARG2(__lsx_vdp2_w_h, madd_s1_m, cst0, madd_s0_m, cst0,             \
              madd_s1_m, cst1, madd_s0_m, cst1, tmp0_m, tmp1_m,             \
              tmp2_m, tmp3_m);                                              \
    DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS, tmp1_m,           \
              VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS, tmp3_m,       \
              VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m);          \
    DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
    DUP4_ARG2(__lsx_vdp2_w_h, madd_s3_m, cst2, madd_s2_m, cst2, madd_s3_m,  \
              cst3, madd_s2_m, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);       \
    DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS,                   \
              tmp1_m, VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS,       \
              tmp3_m, VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
    DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
}
182cabdff1aSopenharmony_ci
/*
 * Statement expression building a coefficient pair from a vector of 16-bit
 * constants: halfword idx1_h of mask_h is replicated into the even
 * halfwords of the result and halfword idx2_h into the odd halfwords
 * (assuming the usual __lsx_vpackev_h lane order). idx1_h and idx2_h are
 * passed straight to __lsx_vreplvei_h, so they are presumably required to
 * be compile-time constants.
 */
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)                           \
( {                                                                          \
    __m128i c0_m, c1_m;                                                      \
                                                                             \
    DUP2_ARG2(__lsx_vreplvei_h, mask_h, idx1_h, mask_h, idx2_h, c0_m, c1_m); \
    c0_m = __lsx_vpackev_h(c1_m, c0_m);                                      \
                                                                             \
    c0_m;                                                                    \
} )
192cabdff1aSopenharmony_ci
/*
 * idct 8x8 macro: one 1-D 8-point inverse DCT pass over eight vectors of
 * 16-bit values (in0..in7 -> out0..out7).
 *
 * Odd part:  in1/in7 are rotated by (cospi_28_64, cospi_4_64) and in3/in5 by
 *            (cospi_12_64, cospi_20_64); sums and differences of the results
 *            feed a cospi_16_64 rotation, giving tp4_m..tp7_m.
 * Even part: in0/in4 are combined with cospi_16_64 and in2/in6 are rotated
 *            by (cospi_24_64, cospi_8_64), followed by a 4-point butterfly,
 *            giving tp0_m..tp3_m.
 * A final 8-point butterfly merges both halves into out0..out7.
 *
 * in0..in7 are used as scratch and are overwritten, even when the outputs
 * are different variables. The outputs may be the same variables as the
 * inputs.
 */
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,                 \
                       out0, out1, out2, out3, out4, out5, out6, out7)         \
{                                                                              \
    __m128i tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;            \
    __m128i k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;            \
    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
          cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };              \
                                                                               \
    /* (c28, -c4), (c4, c28), (-c20, c12), (c12, c20) */                       \
    k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
    k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
    k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
    k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
    VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
    DUP2_ARG2(__lsx_vsub_h, in1, in3, in7, in5, res0_m, res1_m);               \
    /* k0_m = (c16, -c16), k1_m = (c16, c16); reused by the even part */       \
    k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
    k1_m = __lsx_vreplvei_h(mask_m, 4);                                        \
                                                                               \
    res2_m = __lsx_vilvl_h(res0_m, res1_m);                                    \
    res3_m = __lsx_vilvh_h(res0_m, res1_m);                                    \
    DUP4_ARG2(__lsx_vdp2_w_h, res2_m, k0_m, res3_m, k0_m, res2_m, k1_m,        \
              res3_m, k1_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS,                      \
              tmp1_m, VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS,          \
              tmp3_m, VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m);     \
    tp4_m = __lsx_vadd_h(in1, in3);                                            \
    DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);  \
    tp7_m = __lsx_vadd_h(in7, in5);                                            \
    k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
    k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
    VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
             in0, in4, in2, in6);                                              \
    LSX_BUTTERFLY_4_H(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);         \
    LSX_BUTTERFLY_8_H(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,  \
                  out0, out1, out2, out3, out4, out5, out6, out7);             \
}
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_cistatic av_always_inline
232cabdff1aSopenharmony_civoid vp9_idct8x8_1_add_lsx(int16_t *input, uint8_t *dst,
233cabdff1aSopenharmony_ci                                  int32_t dst_stride)
234cabdff1aSopenharmony_ci{
235cabdff1aSopenharmony_ci    int16_t out;
236cabdff1aSopenharmony_ci    int32_t val;
237cabdff1aSopenharmony_ci    __m128i vec;
238cabdff1aSopenharmony_ci
239cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
240cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
241cabdff1aSopenharmony_ci    val = ROUND_POWER_OF_TWO(out, 5);
242cabdff1aSopenharmony_ci    vec = __lsx_vreplgr2vr_h(val);
243cabdff1aSopenharmony_ci    input[0] = 0;
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
246cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
247cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
248cabdff1aSopenharmony_ci}
249cabdff1aSopenharmony_ci
250cabdff1aSopenharmony_cistatic void vp9_idct8x8_12_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
251cabdff1aSopenharmony_ci                                             int32_t dst_stride)
252cabdff1aSopenharmony_ci{
253cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
254cabdff1aSopenharmony_ci    __m128i s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
255cabdff1aSopenharmony_ci    __m128i tmp0, tmp1, tmp2, tmp3;
256cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
257cabdff1aSopenharmony_ci
258cabdff1aSopenharmony_ci    /* load vector elements of 8x8 block */
259cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
260cabdff1aSopenharmony_ci              in0, in1, in2, in3);
261cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
262cabdff1aSopenharmony_ci              in4, in5, in6, in7);
263cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 0);
264cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 16);
265cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32);
266cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 48);
267cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 64);
268cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 80);
269cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 96);
270cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 112);
271cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_d,in1, in0, in3, in2, in5, in4, in7,
272cabdff1aSopenharmony_ci              in6, in0, in1, in2, in3);
273cabdff1aSopenharmony_ci
274cabdff1aSopenharmony_ci    /* stage1 */
275cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_h, in3, in0, in2, in1, s0, s1);
276cabdff1aSopenharmony_ci    k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
277cabdff1aSopenharmony_ci    k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
278cabdff1aSopenharmony_ci    k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
279cabdff1aSopenharmony_ci    k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
280cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, s0, k0, s0, k1, s1, k2, s1, k3,
281cabdff1aSopenharmony_ci              tmp0, tmp1, tmp2, tmp3);
282cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1,
283cabdff1aSopenharmony_ci              VP9_DCT_CONST_BITS, tmp2, VP9_DCT_CONST_BITS, tmp3,
284cabdff1aSopenharmony_ci              VP9_DCT_CONST_BITS, tmp0, tmp1, tmp2, tmp3);
285cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, zero, tmp2, zero, tmp3,
286cabdff1aSopenharmony_ci              s0, s1, s2, s3);
287cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(s0, s1, s3, s2, s4, s7, s6, s5);
288cabdff1aSopenharmony_ci
289cabdff1aSopenharmony_ci    /* stage2 */
290cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_h, in3, in1, in2, in0, s1, s0);
291cabdff1aSopenharmony_ci    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
292cabdff1aSopenharmony_ci    k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
293cabdff1aSopenharmony_ci    k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
294cabdff1aSopenharmony_ci    k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
295cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vdp2_w_h, s0, k0, s0, k1, s1, k2, s1, k3,
296cabdff1aSopenharmony_ci                  tmp0, tmp1, tmp2, tmp3);
297cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1,
298cabdff1aSopenharmony_ci              VP9_DCT_CONST_BITS, tmp2, VP9_DCT_CONST_BITS, tmp3,
299cabdff1aSopenharmony_ci              VP9_DCT_CONST_BITS, tmp0, tmp1, tmp2, tmp3);
300cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, zero, tmp2, zero, tmp3,
301cabdff1aSopenharmony_ci              s0, s1, s2, s3);
302cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(s0, s1, s2, s3, m0, m1, m2, m3);
303cabdff1aSopenharmony_ci
304cabdff1aSopenharmony_ci    /* stage3 */
305cabdff1aSopenharmony_ci    s0 = __lsx_vilvl_h(s6, s5);
306cabdff1aSopenharmony_ci
307cabdff1aSopenharmony_ci    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
308cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vdp2_w_h, s0, k1, s0, k0, tmp0, tmp1);
309cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1,
310cabdff1aSopenharmony_ci              VP9_DCT_CONST_BITS, tmp0, tmp1);
311cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, s2, s3);
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_ci    /* stage4 */
314cabdff1aSopenharmony_ci    LSX_BUTTERFLY_8_H(m0, m1, m2, m3, s4, s2, s3, s7,
315cabdff1aSopenharmony_ci                      in0, in1, in2, in3, in4, in5, in6, in7);
316cabdff1aSopenharmony_ci    VP9_ILVLTRANS4x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
317cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
318cabdff1aSopenharmony_ci    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
319cabdff1aSopenharmony_ci                   in0, in1, in2, in3, in4, in5, in6, in7);
320cabdff1aSopenharmony_ci
321cabdff1aSopenharmony_ci    /* final rounding (add 2^4, divide by 2^5) and shift */
322cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, in0 , 5, in1, 5, in2, 5, in3, 5,
323cabdff1aSopenharmony_ci              in0, in1, in2, in3);
324cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, in4 , 5, in5, 5, in6, 5, in7, 5,
325cabdff1aSopenharmony_ci              in4, in5, in6, in7);
326cabdff1aSopenharmony_ci
327cabdff1aSopenharmony_ci    /* add block and store 8x8 */
328cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
329cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
330cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
331cabdff1aSopenharmony_ci}
332cabdff1aSopenharmony_ci
333cabdff1aSopenharmony_cistatic void vp9_idct8x8_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
334cabdff1aSopenharmony_ci                                          int32_t dst_stride)
335cabdff1aSopenharmony_ci{
336cabdff1aSopenharmony_ci    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
337cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
338cabdff1aSopenharmony_ci
339cabdff1aSopenharmony_ci    /* load vector elements of 8x8 block */
340cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
341cabdff1aSopenharmony_ci              in0, in1, in2, in3);
342cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
343cabdff1aSopenharmony_ci              in4, in5, in6, in7);
344cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 0);
345cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 16);
346cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32);
347cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 48);
348cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 64);
349cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 80);
350cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 96);
351cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 112);
352cabdff1aSopenharmony_ci    /* 1D idct8x8 */
353cabdff1aSopenharmony_ci    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
354cabdff1aSopenharmony_ci                   in0, in1, in2, in3, in4, in5, in6, in7);
355cabdff1aSopenharmony_ci    /* columns transform */
356cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
357cabdff1aSopenharmony_ci                       in0, in1, in2, in3, in4, in5, in6, in7);
358cabdff1aSopenharmony_ci    /* 1D idct8x8 */
359cabdff1aSopenharmony_ci    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
360cabdff1aSopenharmony_ci                   in0, in1, in2, in3, in4, in5, in6, in7);
361cabdff1aSopenharmony_ci    /* final rounding (add 2^4, divide by 2^5) and shift */
362cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, in0, 5, in1, 5, in2, 5, in3, 5,
363cabdff1aSopenharmony_ci              in0, in1, in2, in3);
364cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, in4, 5, in5, 5, in6, 5, in7, 5,
365cabdff1aSopenharmony_ci              in4, in5, in6, in7);
366cabdff1aSopenharmony_ci    /* add block and store 8x8 */
367cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
368cabdff1aSopenharmony_ci    dst += (4 * dst_stride);
369cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
370cabdff1aSopenharmony_ci}
371cabdff1aSopenharmony_ci
372cabdff1aSopenharmony_cistatic void vp9_idct16_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
373cabdff1aSopenharmony_ci                                             int32_t dst_stride)
374cabdff1aSopenharmony_ci{
375cabdff1aSopenharmony_ci    __m128i loc0, loc1, loc2, loc3;
376cabdff1aSopenharmony_ci    __m128i reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
377cabdff1aSopenharmony_ci    __m128i reg1, reg3, reg5, reg7, reg9, reg11, reg13, reg15;
378cabdff1aSopenharmony_ci    __m128i tmp5, tmp6, tmp7;
379cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
380cabdff1aSopenharmony_ci    int32_t offset = dst_stride << 2;
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*0, input, 32*1, input, 32*2, input, 32*3,
383cabdff1aSopenharmony_ci              reg0, reg1, reg2, reg3);
384cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*4, input, 32*5, input, 32*6, input, 32*7,
385cabdff1aSopenharmony_ci              reg4, reg5, reg6, reg7);
386cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*8, input, 32*9, input, 32*10, input, 32*11,
387cabdff1aSopenharmony_ci              reg8, reg9, reg10, reg11);
388cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*12, input, 32*13, input, 32*14, input,
389cabdff1aSopenharmony_ci              32*15, reg12, reg13, reg14, reg15);
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*0);
392cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*1);
393cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*2);
394cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*3);
395cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*4);
396cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*5);
397cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*6);
398cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*7);
399cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*8);
400cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*9);
401cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*10);
402cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*11);
403cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*12);
404cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*13);
405cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*14);
406cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*15);
407cabdff1aSopenharmony_ci
408cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
409cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
410cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
411cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
412cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
413cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
414cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
415cabdff1aSopenharmony_ci
416cabdff1aSopenharmony_ci    reg0 = __lsx_vsub_h(reg2, loc1);
417cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg2, loc1);
418cabdff1aSopenharmony_ci    reg12 = __lsx_vsub_h(reg14, loc0);
419cabdff1aSopenharmony_ci    reg14 = __lsx_vadd_h(reg14, loc0);
420cabdff1aSopenharmony_ci    reg4 = __lsx_vsub_h(reg6, loc3);
421cabdff1aSopenharmony_ci    reg6 = __lsx_vadd_h(reg6, loc3);
422cabdff1aSopenharmony_ci    reg8 = __lsx_vsub_h(reg10, loc2);
423cabdff1aSopenharmony_ci    reg10 = __lsx_vadd_h(reg10, loc2);
424cabdff1aSopenharmony_ci
425cabdff1aSopenharmony_ci    /* stage2 */
426cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
427cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
428cabdff1aSopenharmony_ci
429cabdff1aSopenharmony_ci    reg9 = __lsx_vsub_h(reg1, loc2);
430cabdff1aSopenharmony_ci    reg1 = __lsx_vadd_h(reg1, loc2);
431cabdff1aSopenharmony_ci    reg7 = __lsx_vsub_h(reg15, loc3);
432cabdff1aSopenharmony_ci    reg15 = __lsx_vadd_h(reg15, loc3);
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
435cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
436cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
437cabdff1aSopenharmony_ci
438cabdff1aSopenharmony_ci    loc1 = __lsx_vadd_h(reg15, reg3);
439cabdff1aSopenharmony_ci    reg3 = __lsx_vsub_h(reg15, reg3);
440cabdff1aSopenharmony_ci    loc2 = __lsx_vadd_h(reg2, loc1);
441cabdff1aSopenharmony_ci    reg15 = __lsx_vsub_h(reg2, loc1);
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci    loc1 = __lsx_vadd_h(reg1, reg13);
444cabdff1aSopenharmony_ci    reg13 = __lsx_vsub_h(reg1, reg13);
445cabdff1aSopenharmony_ci    loc0 = __lsx_vadd_h(reg0, loc1);
446cabdff1aSopenharmony_ci    loc1 = __lsx_vsub_h(reg0, loc1);
447cabdff1aSopenharmony_ci    tmp6 = loc0;
448cabdff1aSopenharmony_ci    tmp7 = loc1;
449cabdff1aSopenharmony_ci    reg0 = loc2;
450cabdff1aSopenharmony_ci
451cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
452cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg5), __lsx_vneg_h(reg11), cospi_8_64,
453cabdff1aSopenharmony_ci                        cospi_24_64, reg5, reg11);
454cabdff1aSopenharmony_ci
455cabdff1aSopenharmony_ci    loc0 = __lsx_vadd_h(reg9, reg5);
456cabdff1aSopenharmony_ci    reg5 = __lsx_vsub_h(reg9, reg5);
457cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg6, loc0);
458cabdff1aSopenharmony_ci    reg1 = __lsx_vsub_h(reg6, loc0);
459cabdff1aSopenharmony_ci
460cabdff1aSopenharmony_ci    loc0 = __lsx_vadd_h(reg7, reg11);
461cabdff1aSopenharmony_ci    reg11 = __lsx_vsub_h(reg7, reg11);
462cabdff1aSopenharmony_ci    loc1 = __lsx_vadd_h(reg4, loc0);
463cabdff1aSopenharmony_ci    loc2 = __lsx_vsub_h(reg4, loc0);
464cabdff1aSopenharmony_ci    tmp5 = loc1;
465cabdff1aSopenharmony_ci
466cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
467cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_ci    reg10 = loc0;
470cabdff1aSopenharmony_ci    reg11 = loc1;
471cabdff1aSopenharmony_ci
472cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
473cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
474cabdff1aSopenharmony_ci    reg13 = loc2;
475cabdff1aSopenharmony_ci
476cabdff1aSopenharmony_ci    /* Transpose and store the output */
477cabdff1aSopenharmony_ci    reg12 = tmp5;
478cabdff1aSopenharmony_ci    reg14 = tmp6;
479cabdff1aSopenharmony_ci    reg3 = tmp7;
480cabdff1aSopenharmony_ci
481cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, reg0, 6, reg2, 6, reg4, 6, reg6, 6,
482cabdff1aSopenharmony_ci              reg0, reg2, reg4, reg6);
483cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
484cabdff1aSopenharmony_ci    dst += offset;
485cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, reg8, 6, reg10, 6, reg12, 6, reg14, 6,
486cabdff1aSopenharmony_ci              reg8, reg10, reg12, reg14);
487cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
488cabdff1aSopenharmony_ci    dst += offset;
489cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, reg3, 6, reg5, 6, reg11, 6, reg13, 6,
490cabdff1aSopenharmony_ci              reg3, reg5, reg11, reg13);
491cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
492cabdff1aSopenharmony_ci    dst += offset;
493cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, reg1, 6, reg7, 6, reg9, 6, reg15, 6,
494cabdff1aSopenharmony_ci              reg1, reg7, reg9, reg15);
495cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
496cabdff1aSopenharmony_ci}
497cabdff1aSopenharmony_ci
/**
 * One 1-D pass of the 16-point inverse transform over an 8-lane-wide slice,
 * producing a transposed intermediate (no rounding shift, no add to pixels).
 *
 * Sixteen 128-bit vectors are loaded from input at a 32-byte pitch and the
 * locations they were read from are overwritten with zero. After the
 * butterfly network the 16 result vectors are transposed as two 8x8 blocks
 * of 16-bit lanes and written to output.
 *
 * @param input  coefficient buffer; read at byte offsets 32*0 .. 32*15 and
 *               zeroed at those same offsets
 * @param output transposed intermediate; eight rows 32 bytes apart are
 *               written, each row receiving bytes 0..15 from the first
 *               transposed block and bytes 16..31 from the second
 */
498cabdff1aSopenharmony_cistatic void vp9_idct16_1d_columns_lsx(int16_t *input, int16_t *output)
499cabdff1aSopenharmony_ci{
500cabdff1aSopenharmony_ci    __m128i loc0, loc1, loc2, loc3;
501cabdff1aSopenharmony_ci    __m128i reg1, reg3, reg5, reg7, reg9, reg11, reg13, reg15;
502cabdff1aSopenharmony_ci    __m128i reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
503cabdff1aSopenharmony_ci    __m128i tmp5, tmp6, tmp7;
504cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
505cabdff1aSopenharmony_ci    int16_t *offset;
506cabdff1aSopenharmony_ci
    /* regN is loaded from byte offset 32 * N of input. */
507cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*0, input, 32*1, input, 32*2, input, 32*3,
508cabdff1aSopenharmony_ci              reg0, reg1, reg2, reg3);
509cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*4, input, 32*5, input, 32*6, input, 32*7,
510cabdff1aSopenharmony_ci              reg4, reg5, reg6, reg7);
511cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*8, input, 32*9, input, 32*10, input, 32*11,
512cabdff1aSopenharmony_ci              reg8, reg9, reg10, reg11);
513cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, input, 32*12, input, 32*13, input, 32*14, input,
514cabdff1aSopenharmony_ci              32*15, reg12, reg13, reg14, reg15);
515cabdff1aSopenharmony_ci
    /* Clear exactly the 16 vectors that were just read. */
516cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*0);
517cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*1);
518cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*2);
519cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*3);
520cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*4);
521cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*5);
522cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*6);
523cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*7);
524cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*8);
525cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*9);
526cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*10);
527cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*11);
528cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*12);
529cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*13);
530cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*14);
531cabdff1aSopenharmony_ci    __lsx_vst(zero, input, 32*15);
532cabdff1aSopenharmony_ci
    /*
     * Even-indexed inputs: up to the "stage2" marker only reg0, reg2, ...,
     * reg14 and the loc0..loc3 temporaries are read or written.
     */
533cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
534cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
535cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
536cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
537cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
538cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
539cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
540cabdff1aSopenharmony_ci
541cabdff1aSopenharmony_ci    reg0 = __lsx_vsub_h(reg2, loc1);
542cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg2, loc1);
543cabdff1aSopenharmony_ci    reg12 = __lsx_vsub_h(reg14, loc0);
544cabdff1aSopenharmony_ci    reg14 = __lsx_vadd_h(reg14, loc0);
545cabdff1aSopenharmony_ci    reg4 = __lsx_vsub_h(reg6, loc3);
546cabdff1aSopenharmony_ci    reg6 = __lsx_vadd_h(reg6, loc3);
547cabdff1aSopenharmony_ci    reg8 = __lsx_vsub_h(reg10, loc2);
548cabdff1aSopenharmony_ci    reg10 = __lsx_vadd_h(reg10, loc2);
549cabdff1aSopenharmony_ci
    /*
     * From here on the odd-indexed inputs (reg1, reg3, ..., reg15) are
     * processed and then combined with the even-side results computed above.
     */
550cabdff1aSopenharmony_ci    /* stage2 */
551cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
552cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
553cabdff1aSopenharmony_ci
554cabdff1aSopenharmony_ci    reg9 = __lsx_vsub_h(reg1, loc2);
555cabdff1aSopenharmony_ci    reg1 = __lsx_vadd_h(reg1, loc2);
556cabdff1aSopenharmony_ci    reg7 = __lsx_vsub_h(reg15, loc3);
557cabdff1aSopenharmony_ci    reg15 = __lsx_vadd_h(reg15, loc3);
558cabdff1aSopenharmony_ci
559cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
560cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
561cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
562cabdff1aSopenharmony_ci
563cabdff1aSopenharmony_ci    loc1 = __lsx_vadd_h(reg15, reg3);
564cabdff1aSopenharmony_ci    reg3 = __lsx_vsub_h(reg15, reg3);
565cabdff1aSopenharmony_ci    loc2 = __lsx_vadd_h(reg2, loc1);
566cabdff1aSopenharmony_ci    reg15 = __lsx_vsub_h(reg2, loc1);
567cabdff1aSopenharmony_ci
568cabdff1aSopenharmony_ci    loc1 = __lsx_vadd_h(reg1, reg13);
569cabdff1aSopenharmony_ci    reg13 = __lsx_vsub_h(reg1, reg13);
570cabdff1aSopenharmony_ci    loc0 = __lsx_vadd_h(reg0, loc1);
571cabdff1aSopenharmony_ci    loc1 = __lsx_vsub_h(reg0, loc1);
    /*
     * Park these two results in tmp6/tmp7: loc0 and loc1 are overwritten
     * again further down, and the values are only renamed to reg14/reg3
     * just before the transposes.
     */
572cabdff1aSopenharmony_ci    tmp6 = loc0;
573cabdff1aSopenharmony_ci    tmp7 = loc1;
574cabdff1aSopenharmony_ci    reg0 = loc2;
575cabdff1aSopenharmony_ci
576cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
577cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg5), __lsx_vneg_h(reg11), cospi_8_64,
578cabdff1aSopenharmony_ci                        cospi_24_64, reg5, reg11);
579cabdff1aSopenharmony_ci
580cabdff1aSopenharmony_ci    loc0 = __lsx_vadd_h(reg9, reg5);
581cabdff1aSopenharmony_ci    reg5 = __lsx_vsub_h(reg9, reg5);
582cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg6, loc0);
583cabdff1aSopenharmony_ci    reg1 = __lsx_vsub_h(reg6, loc0);
584cabdff1aSopenharmony_ci
585cabdff1aSopenharmony_ci    loc0 = __lsx_vadd_h(reg7, reg11);
586cabdff1aSopenharmony_ci    reg11 = __lsx_vsub_h(reg7, reg11);
587cabdff1aSopenharmony_ci    loc1 = __lsx_vadd_h(reg4, loc0);
588cabdff1aSopenharmony_ci    loc2 = __lsx_vsub_h(reg4, loc0);
589cabdff1aSopenharmony_ci
    /* Park loc1 as well: it is an output of the next butterfly. */
590cabdff1aSopenharmony_ci    tmp5 = loc1;
591cabdff1aSopenharmony_ci
592cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
593cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
594cabdff1aSopenharmony_ci
595cabdff1aSopenharmony_ci    reg10 = loc0;
596cabdff1aSopenharmony_ci    reg11 = loc1;
597cabdff1aSopenharmony_ci
598cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
599cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
600cabdff1aSopenharmony_ci    reg13 = loc2;
601cabdff1aSopenharmony_ci
602cabdff1aSopenharmony_ci    /* Transpose and store the output */
    /* Move the parked results into the names used by the transposes below. */
603cabdff1aSopenharmony_ci    reg12 = tmp5;
604cabdff1aSopenharmony_ci    reg14 = tmp6;
605cabdff1aSopenharmony_ci    reg3 = tmp7;
606cabdff1aSopenharmony_ci
607cabdff1aSopenharmony_ci    /* transpose block */
608cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
609cabdff1aSopenharmony_ci                       reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
610cabdff1aSopenharmony_ci
    /* First block: bytes 0..15 of eight output rows, rows 32 bytes apart. */
611cabdff1aSopenharmony_ci    __lsx_vst(reg0, output, 32*0);
612cabdff1aSopenharmony_ci    __lsx_vst(reg2, output, 32*1);
613cabdff1aSopenharmony_ci    __lsx_vst(reg4, output, 32*2);
614cabdff1aSopenharmony_ci    __lsx_vst(reg6, output, 32*3);
615cabdff1aSopenharmony_ci    __lsx_vst(reg8, output, 32*4);
616cabdff1aSopenharmony_ci    __lsx_vst(reg10, output, 32*5);
617cabdff1aSopenharmony_ci    __lsx_vst(reg12, output, 32*6);
618cabdff1aSopenharmony_ci    __lsx_vst(reg14, output, 32*7);
619cabdff1aSopenharmony_ci
620cabdff1aSopenharmony_ci    /* transpose block */
621cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
622cabdff1aSopenharmony_ci                       reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
623cabdff1aSopenharmony_ci
    /*
     * Second block: output is int16_t *, so "+ 8" skips 16 bytes and lands
     * on bytes 16..31 of the same rows. Rows 0..3 first.
     */
624cabdff1aSopenharmony_ci    offset = output + 8;
625cabdff1aSopenharmony_ci    __lsx_vst(reg3, offset, 32*0);
626cabdff1aSopenharmony_ci    __lsx_vst(reg13, offset, 32*1);
627cabdff1aSopenharmony_ci    __lsx_vst(reg11, offset, 32*2);
628cabdff1aSopenharmony_ci    __lsx_vst(reg5, offset, 32*3);
629cabdff1aSopenharmony_ci
    /* "+ 4 * 16" elements is a further 128 bytes, i.e. rows 4..7. */
630cabdff1aSopenharmony_ci    offset = output + 8 + 4 * 16;
631cabdff1aSopenharmony_ci    __lsx_vst(reg7, offset, 32*0);
632cabdff1aSopenharmony_ci    __lsx_vst(reg9, offset, 32*1);
633cabdff1aSopenharmony_ci    __lsx_vst(reg1, offset, 32*2);
634cabdff1aSopenharmony_ci    __lsx_vst(reg15, offset, 32*3);
635cabdff1aSopenharmony_ci}
636cabdff1aSopenharmony_ci
637cabdff1aSopenharmony_cistatic void vp9_idct16x16_1_add_lsx(int16_t *input, uint8_t *dst,
638cabdff1aSopenharmony_ci                                    int32_t dst_stride)
639cabdff1aSopenharmony_ci{
640cabdff1aSopenharmony_ci    uint8_t i;
641cabdff1aSopenharmony_ci    int16_t out;
642cabdff1aSopenharmony_ci    __m128i vec, res0, res1, res2, res3, res4, res5, res6, res7;
643cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
644cabdff1aSopenharmony_ci    int32_t stride2 = dst_stride << 1;
645cabdff1aSopenharmony_ci    int32_t stride3 = stride2 + dst_stride;
646cabdff1aSopenharmony_ci    int32_t stride4 = stride2 << 1;
647cabdff1aSopenharmony_ci
648cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
649cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
650cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO(out, 6);
651cabdff1aSopenharmony_ci    input[0] = 0;
652cabdff1aSopenharmony_ci    vec = __lsx_vreplgr2vr_h(out);
653cabdff1aSopenharmony_ci
654cabdff1aSopenharmony_ci    for (i = 4; i--;) {
655cabdff1aSopenharmony_ci        dst0 = __lsx_vld(dst, 0);
656cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, stride2, dst1, dst2);
657cabdff1aSopenharmony_ci        dst3 = __lsx_vldx(dst, stride3);
658cabdff1aSopenharmony_ci        VP9_UNPCK_UB_SH(dst0, res4, res0);
659cabdff1aSopenharmony_ci        VP9_UNPCK_UB_SH(dst1, res5, res1);
660cabdff1aSopenharmony_ci        VP9_UNPCK_UB_SH(dst2, res6, res2);
661cabdff1aSopenharmony_ci        VP9_UNPCK_UB_SH(dst3, res7, res3);
662cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec,
663cabdff1aSopenharmony_ci                  res0, res1, res2, res3);
664cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec,
665cabdff1aSopenharmony_ci                  res4, res5, res6, res7);
666cabdff1aSopenharmony_ci        DUP4_ARG1(__lsx_vclip255_h, res0, res1, res2, res3,
667cabdff1aSopenharmony_ci                  res0, res1, res2, res3);
668cabdff1aSopenharmony_ci        DUP4_ARG1(__lsx_vclip255_h, res4, res5, res6, res7,
669cabdff1aSopenharmony_ci                  res4, res5, res6, res7);
670cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpickev_b, res4, res0, res5, res1, res6,
671cabdff1aSopenharmony_ci                  res2, res7, res3, tmp0, tmp1, tmp2, tmp3);
672cabdff1aSopenharmony_ci        __lsx_vst(tmp0, dst, 0);
673cabdff1aSopenharmony_ci        __lsx_vstx(tmp1, dst, dst_stride);
674cabdff1aSopenharmony_ci        __lsx_vstx(tmp2, dst, stride2);
675cabdff1aSopenharmony_ci        __lsx_vstx(tmp3, dst, stride3);
676cabdff1aSopenharmony_ci        dst += stride4;
677cabdff1aSopenharmony_ci    }
678cabdff1aSopenharmony_ci}
679cabdff1aSopenharmony_ci
680cabdff1aSopenharmony_cistatic void vp9_idct16x16_10_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
681cabdff1aSopenharmony_ci                                               int32_t dst_stride)
682cabdff1aSopenharmony_ci{
683cabdff1aSopenharmony_ci    int32_t i;
684cabdff1aSopenharmony_ci    int16_t out_arr[16 * 16] ALLOC_ALIGNED(16);
685cabdff1aSopenharmony_ci    int16_t *out = out_arr;
686cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
687cabdff1aSopenharmony_ci
688cabdff1aSopenharmony_ci    /* transform rows */
689cabdff1aSopenharmony_ci    vp9_idct16_1d_columns_lsx(input, out);
690cabdff1aSopenharmony_ci
691cabdff1aSopenharmony_ci    /* short case just considers top 4 rows as valid output */
692cabdff1aSopenharmony_ci    out += 4 * 16;
693cabdff1aSopenharmony_ci    for (i = 3; i--;) {
694cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 0);
695cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 16);
696cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 32);
697cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 48);
698cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 64);
699cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 80);
700cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 96);
701cabdff1aSopenharmony_ci        __lsx_vst(zero, out, 112);
702cabdff1aSopenharmony_ci        out += 64;
703cabdff1aSopenharmony_ci    }
704cabdff1aSopenharmony_ci
705cabdff1aSopenharmony_ci    out = out_arr;
706cabdff1aSopenharmony_ci
707cabdff1aSopenharmony_ci    /* transform columns */
708cabdff1aSopenharmony_ci    for (i = 0; i < 2; i++) {
709cabdff1aSopenharmony_ci        /* process 8 * 16 block */
710cabdff1aSopenharmony_ci        vp9_idct16_1d_columns_addblk_lsx((out + (i << 3)), (dst + (i << 3)),
711cabdff1aSopenharmony_ci                                         dst_stride);
712cabdff1aSopenharmony_ci    }
713cabdff1aSopenharmony_ci}
714cabdff1aSopenharmony_ci
715cabdff1aSopenharmony_cistatic void vp9_idct16x16_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
716cabdff1aSopenharmony_ci                                            int32_t dst_stride)
717cabdff1aSopenharmony_ci{
718cabdff1aSopenharmony_ci    int32_t i;
719cabdff1aSopenharmony_ci    int16_t out_arr[16 * 16] ALLOC_ALIGNED(16);
720cabdff1aSopenharmony_ci    int16_t *out = out_arr;
721cabdff1aSopenharmony_ci
722cabdff1aSopenharmony_ci    /* transform rows */
723cabdff1aSopenharmony_ci    for (i = 0; i < 2; i++) {
724cabdff1aSopenharmony_ci        /* process 8 * 16 block */
725cabdff1aSopenharmony_ci        vp9_idct16_1d_columns_lsx((input + (i << 3)), (out + (i << 7)));
726cabdff1aSopenharmony_ci    }
727cabdff1aSopenharmony_ci
728cabdff1aSopenharmony_ci    /* transform columns */
729cabdff1aSopenharmony_ci    for (i = 0; i < 2; i++) {
730cabdff1aSopenharmony_ci        /* process 8 * 16 block */
731cabdff1aSopenharmony_ci        vp9_idct16_1d_columns_addblk_lsx((out + (i << 3)), (dst + (i << 3)),
732cabdff1aSopenharmony_ci                                         dst_stride);
733cabdff1aSopenharmony_ci    }
734cabdff1aSopenharmony_ci}
735cabdff1aSopenharmony_ci
/**
 * Final butterfly between the even and odd buffers, followed by a transpose
 * of all 32 result vectors into dst.
 *
 * Sixteen (even, odd) vector pairs are combined. Each sum is kept in a
 * register (m0..m7, n0..n7); each difference is stored to tmp_buf at byte
 * offsets 16*16 .. 31*16. The 16 sums are transposed as two 8x8 blocks of
 * 16-bit lanes and stored to dst, then the 16 differences are reloaded from
 * tmp_buf, transposed the same way and stored beside them.
 *
 * @param tmp_buf     scratch buffer; bytes 16*16 .. 32*16-1 are written and
 *                    then read back
 * @param tmp_eve_buf even-side values, read as 16 vectors at byte offsets
 *                    0*16 .. 15*16
 * @param tmp_odd_buf odd-side values, read as 16 vectors at byte offsets
 *                    0*16 .. 15*16
 * @param dst         output; eight rows 64 bytes apart are written, each row
 *                    receiving four vectors at byte columns 0, 16, 32 and 48
 */
736cabdff1aSopenharmony_cistatic void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
737cabdff1aSopenharmony_ci                                               int16_t *tmp_eve_buf,
738cabdff1aSopenharmony_ci                                               int16_t *tmp_odd_buf,
739cabdff1aSopenharmony_ci                                               int16_t *dst)
740cabdff1aSopenharmony_ci{
741cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
742cabdff1aSopenharmony_ci    __m128i m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
743cabdff1aSopenharmony_ci
744cabdff1aSopenharmony_ci    /* FINAL BUTTERFLY : Dependency on Even & Odd */
745cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 0);
746cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 9 * 16);
747cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 14 * 16);
748cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 6 * 16);
749cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 0);
750cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 8 * 16);
751cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 4 * 16);
752cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 12 * 16);
753cabdff1aSopenharmony_ci
    /* Sums stay in registers for the first transpose pass. */
754cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h,loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
755cabdff1aSopenharmony_ci              m0, m4, m2, m6);
756cabdff1aSopenharmony_ci
    /*
     * Shorthand for the 16-bit lane-wise subtraction. It is not #undef'd
     * anywhere in this function, so it stays defined for code that follows.
     */
757cabdff1aSopenharmony_ci    #define SUB(a, b) __lsx_vsub_h(a, b)
758cabdff1aSopenharmony_ci
    /* Differences are spilled to the scratch buffer and reloaded later. */
759cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc0, vec3), tmp_buf, 31 * 16);
760cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc1, vec2), tmp_buf, 23 * 16);
761cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc2, vec1), tmp_buf, 27 * 16);
762cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc3, vec0), tmp_buf, 19 * 16);
763cabdff1aSopenharmony_ci
764cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
765cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 4 * 16);
766cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 13 * 16);
767cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 10 * 16);
768cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 3 * 16);
769cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 2 * 16);
770cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 10 * 16);
771cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 6 * 16);
772cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 14 * 16);
773cabdff1aSopenharmony_ci
774cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
775cabdff1aSopenharmony_ci              m1, m5, m3, m7);
776cabdff1aSopenharmony_ci
777cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc0, vec3), tmp_buf, 29 * 16);
778cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc1, vec2), tmp_buf, 21 * 16);
779cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc2, vec1), tmp_buf, 25 * 16);
780cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc3, vec0), tmp_buf, 17 * 16);
781cabdff1aSopenharmony_ci
782cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
783cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 2 * 16);
784cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 11 * 16);
785cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 12 * 16);
786cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 7 * 16);
787cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 1 * 16);
788cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 9 * 16);
789cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 5 * 16);
790cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 13 * 16);
791cabdff1aSopenharmony_ci
792cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
793cabdff1aSopenharmony_ci              n0, n4, n2, n6);
794cabdff1aSopenharmony_ci
795cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc0, vec3), tmp_buf, 30 * 16);
796cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc1, vec2), tmp_buf, 22 * 16);
797cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc2, vec1), tmp_buf, 26 * 16);
798cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc3, vec0), tmp_buf, 18 * 16);
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
801cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 5 * 16);
802cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 15 * 16);
803cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 8 * 16);
804cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 1 * 16);
805cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 3 * 16);
806cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 11 * 16);
807cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 7 * 16);
808cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 15 * 16);
809cabdff1aSopenharmony_ci
810cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
811cabdff1aSopenharmony_ci              n1, n5, n3, n7);
812cabdff1aSopenharmony_ci
813cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc0, vec3), tmp_buf, 28 * 16);
814cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc1, vec2), tmp_buf, 20 * 16);
815cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc2, vec1), tmp_buf, 24 * 16);
816cabdff1aSopenharmony_ci    __lsx_vst(SUB(loc3, vec0), tmp_buf, 16 * 16);
817cabdff1aSopenharmony_ci
818cabdff1aSopenharmony_ci    /* Transpose : 16 vectors */
819cabdff1aSopenharmony_ci    /* 1st & 2nd 8x8 */
820cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3,
821cabdff1aSopenharmony_ci                       m0, n0, m1, n1, m2, n2, m3, n3);
    /* Byte column 0 of eight dst rows, rows 32 * 2 bytes apart. */
822cabdff1aSopenharmony_ci    __lsx_vst(m0, dst, 0);
823cabdff1aSopenharmony_ci    __lsx_vst(n0, dst, 32 * 2);
824cabdff1aSopenharmony_ci    __lsx_vst(m1, dst, 32 * 4);
825cabdff1aSopenharmony_ci    __lsx_vst(n1, dst, 32 * 6);
826cabdff1aSopenharmony_ci    __lsx_vst(m2, dst, 32 * 8);
827cabdff1aSopenharmony_ci    __lsx_vst(n2, dst, 32 * 10);
828cabdff1aSopenharmony_ci    __lsx_vst(m3, dst, 32 * 12);
829cabdff1aSopenharmony_ci    __lsx_vst(n3, dst, 32 * 14);
830cabdff1aSopenharmony_ci
831cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7,
832cabdff1aSopenharmony_ci                       m4, n4, m5, n5, m6, n6, m7, n7);
833cabdff1aSopenharmony_ci
    /* Same rows, byte column 16. */
834cabdff1aSopenharmony_ci    __lsx_vst(m4, dst, 16);
835cabdff1aSopenharmony_ci    __lsx_vst(n4, dst, 16 + 32 * 2);
836cabdff1aSopenharmony_ci    __lsx_vst(m5, dst, 16 + 32 * 4);
837cabdff1aSopenharmony_ci    __lsx_vst(n5, dst, 16 + 32 * 6);
838cabdff1aSopenharmony_ci    __lsx_vst(m6, dst, 16 + 32 * 8);
839cabdff1aSopenharmony_ci    __lsx_vst(n6, dst, 16 + 32 * 10);
840cabdff1aSopenharmony_ci    __lsx_vst(m7, dst, 16 + 32 * 12);
841cabdff1aSopenharmony_ci    __lsx_vst(n7, dst, 16 + 32 * 14);
842cabdff1aSopenharmony_ci
843cabdff1aSopenharmony_ci    /* 3rd & 4th 8x8 */
    /*
     * Reload the 16 differences stored above (tmp_buf bytes 16*16 .. 31*16),
     * reusing the m / n register names.
     */
844cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 16, tmp_buf, 16 * 17,
845cabdff1aSopenharmony_ci              tmp_buf, 16 * 18, tmp_buf, 16 * 19, m0, n0, m1, n1);
846cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 20, tmp_buf, 16 * 21,
847cabdff1aSopenharmony_ci              tmp_buf, 16 * 22, tmp_buf, 16 * 23, m2, n2, m3, n3);
848cabdff1aSopenharmony_ci
849cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 24, tmp_buf, 16 * 25,
850cabdff1aSopenharmony_ci              tmp_buf, 16 * 26, tmp_buf, 16 * 27, m4, n4, m5, n5);
851cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 28, tmp_buf, 16 * 29,
852cabdff1aSopenharmony_ci              tmp_buf, 16 * 30, tmp_buf, 16 * 31, m6, n6, m7, n7);
853cabdff1aSopenharmony_ci
854cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3,
855cabdff1aSopenharmony_ci                       m0, n0, m1, n1, m2, n2, m3, n3);
856cabdff1aSopenharmony_ci
    /* Same rows, byte column 32. */
857cabdff1aSopenharmony_ci    __lsx_vst(m0, dst, 32);
858cabdff1aSopenharmony_ci    __lsx_vst(n0, dst, 32 + 32 * 2);
859cabdff1aSopenharmony_ci    __lsx_vst(m1, dst, 32 + 32 * 4);
860cabdff1aSopenharmony_ci    __lsx_vst(n1, dst, 32 + 32 * 6);
861cabdff1aSopenharmony_ci    __lsx_vst(m2, dst, 32 + 32 * 8);
862cabdff1aSopenharmony_ci    __lsx_vst(n2, dst, 32 + 32 * 10);
863cabdff1aSopenharmony_ci    __lsx_vst(m3, dst, 32 + 32 * 12);
864cabdff1aSopenharmony_ci    __lsx_vst(n3, dst, 32 + 32 * 14);
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci    LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7,
867cabdff1aSopenharmony_ci                       m4, n4, m5, n5, m6, n6, m7, n7);
868cabdff1aSopenharmony_ci
    /* Same rows, byte column 48. */
869cabdff1aSopenharmony_ci    __lsx_vst(m4, dst, 48);
870cabdff1aSopenharmony_ci    __lsx_vst(n4, dst, 48 + 32 * 2);
871cabdff1aSopenharmony_ci    __lsx_vst(m5, dst, 48 + 32 * 4);
872cabdff1aSopenharmony_ci    __lsx_vst(n5, dst, 48 + 32 * 6);
873cabdff1aSopenharmony_ci    __lsx_vst(m6, dst, 48 + 32 * 8);
874cabdff1aSopenharmony_ci    __lsx_vst(n6, dst, 48 + 32 * 10);
875cabdff1aSopenharmony_ci    __lsx_vst(m7, dst, 48 + 32 * 12);
876cabdff1aSopenharmony_ci    __lsx_vst(n7, dst, 48 + 32 * 14);
877cabdff1aSopenharmony_ci}
878cabdff1aSopenharmony_ci
879cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
880cabdff1aSopenharmony_ci                                                   int16_t *tmp_eve_buf)
881cabdff1aSopenharmony_ci{
882cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
883cabdff1aSopenharmony_ci    __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
884cabdff1aSopenharmony_ci    __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
885cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
886cabdff1aSopenharmony_ci
887cabdff1aSopenharmony_ci    /* Even stage 1 */
888cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 32 * 8,
889cabdff1aSopenharmony_ci              tmp_buf, 32 * 16, tmp_buf, 32 * 24, reg0, reg1, reg2, reg3);
890cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 32 * 32, tmp_buf, 32 * 40,
891cabdff1aSopenharmony_ci              tmp_buf, 32 * 48, tmp_buf, 32 * 56, reg4, reg5, reg6, reg7);
892cabdff1aSopenharmony_ci
893cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 0);
894cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 8);
895cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 16);
896cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 24);
897cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 32);
898cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 40);
899cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 48);
900cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 56);
901cabdff1aSopenharmony_ci
902cabdff1aSopenharmony_ci    tmp_buf += (2 * 32);
903cabdff1aSopenharmony_ci
904cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
905cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
906cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
907cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
908cabdff1aSopenharmony_ci
909cabdff1aSopenharmony_ci    loc1 = vec3;
910cabdff1aSopenharmony_ci    loc0 = vec1;
911cabdff1aSopenharmony_ci
912cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
913cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
914cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
915cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
916cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
917cabdff1aSopenharmony_ci
918cabdff1aSopenharmony_ci    /* Even stage 2 */
919cabdff1aSopenharmony_ci    /* Load 8 */
920cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 32 * 8,
921cabdff1aSopenharmony_ci              tmp_buf, 32 * 16, tmp_buf, 32 * 24, reg0, reg1, reg2, reg3);
922cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_buf, 32 * 32, tmp_buf, 32 * 40,
923cabdff1aSopenharmony_ci              tmp_buf, 32 * 48, tmp_buf, 32 * 56, reg4, reg5, reg6, reg7);
924cabdff1aSopenharmony_ci
925cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 0);
926cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 8);
927cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 16);
928cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 24);
929cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 32);
930cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 40);
931cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 48);
932cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 32 * 56);
933cabdff1aSopenharmony_ci
934cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
935cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
936cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
937cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
938cabdff1aSopenharmony_ci
939cabdff1aSopenharmony_ci    vec0 = __lsx_vadd_h(reg0, reg4);
940cabdff1aSopenharmony_ci    reg0 = __lsx_vsub_h(reg0, reg4);
941cabdff1aSopenharmony_ci    reg4 = __lsx_vadd_h(reg6, reg2);
942cabdff1aSopenharmony_ci    reg6 = __lsx_vsub_h(reg6, reg2);
943cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg1, reg5);
944cabdff1aSopenharmony_ci    reg1 = __lsx_vsub_h(reg1, reg5);
945cabdff1aSopenharmony_ci    reg5 = __lsx_vadd_h(reg7, reg3);
946cabdff1aSopenharmony_ci    reg7 = __lsx_vsub_h(reg7, reg3);
947cabdff1aSopenharmony_ci    reg3 = vec0;
948cabdff1aSopenharmony_ci
949cabdff1aSopenharmony_ci    vec1 = reg2;
950cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg3, reg4);
951cabdff1aSopenharmony_ci    reg3 = __lsx_vsub_h(reg3, reg4);
952cabdff1aSopenharmony_ci    reg4 = __lsx_vsub_h(reg5, vec1);
953cabdff1aSopenharmony_ci    reg5 = __lsx_vadd_h(reg5, vec1);
954cabdff1aSopenharmony_ci
955cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
956cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg6), reg1, cospi_24_64, cospi_8_64,
957cabdff1aSopenharmony_ci                        reg6, reg1);
958cabdff1aSopenharmony_ci
959cabdff1aSopenharmony_ci    vec0 = __lsx_vsub_h(reg0, reg6);
960cabdff1aSopenharmony_ci    reg0 = __lsx_vadd_h(reg0, reg6);
961cabdff1aSopenharmony_ci    vec1 = __lsx_vsub_h(reg7, reg1);
962cabdff1aSopenharmony_ci    reg7 = __lsx_vadd_h(reg7, reg1);
963cabdff1aSopenharmony_ci
964cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
965cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
966cabdff1aSopenharmony_ci
967cabdff1aSopenharmony_ci    /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
968cabdff1aSopenharmony_ci    /* Store 8 */
969cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
970cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_eve_buf, 0);
971cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_eve_buf, 16);
972cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_eve_buf, 14 * 16);
973cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_eve_buf, 14 * 16 + 16);
974cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
975cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_eve_buf, 2 * 16);
976cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_eve_buf, 2 * 16 + 16);
977cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_eve_buf, 12 * 16);
978cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_eve_buf, 12 * 16 + 16);
979cabdff1aSopenharmony_ci
980cabdff1aSopenharmony_ci    /* Store 8 */
981cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
982cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_eve_buf, 4 * 16);
983cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_eve_buf, 4 * 16 + 16);
984cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_eve_buf, 10 * 16);
985cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_eve_buf, 10 * 16 + 16);
986cabdff1aSopenharmony_ci
987cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
988cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_eve_buf, 6 * 16);
989cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_eve_buf, 6 * 16 + 16);
990cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_eve_buf, 8 * 16);
991cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_eve_buf, 8 * 16 + 16);
992cabdff1aSopenharmony_ci}
993cabdff1aSopenharmony_ci
994cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
995cabdff1aSopenharmony_ci                                                  int16_t *tmp_odd_buf)
996cabdff1aSopenharmony_ci{
997cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
998cabdff1aSopenharmony_ci    __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
999cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
1000cabdff1aSopenharmony_ci
1001cabdff1aSopenharmony_ci    /* Odd stage 1 */
1002cabdff1aSopenharmony_ci    reg0 = __lsx_vld(tmp_buf, 64);
1003cabdff1aSopenharmony_ci    reg1 = __lsx_vld(tmp_buf, 7 * 64);
1004cabdff1aSopenharmony_ci    reg2 = __lsx_vld(tmp_buf, 9 * 64);
1005cabdff1aSopenharmony_ci    reg3 = __lsx_vld(tmp_buf, 15 * 64);
1006cabdff1aSopenharmony_ci    reg4 = __lsx_vld(tmp_buf, 17 * 64);
1007cabdff1aSopenharmony_ci    reg5 = __lsx_vld(tmp_buf, 23 * 64);
1008cabdff1aSopenharmony_ci    reg6 = __lsx_vld(tmp_buf, 25 * 64);
1009cabdff1aSopenharmony_ci    reg7 = __lsx_vld(tmp_buf, 31 * 64);
1010cabdff1aSopenharmony_ci
1011cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 64);
1012cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 7 * 64);
1013cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 9 * 64);
1014cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 15 * 64);
1015cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 17 * 64);
1016cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 23 * 64);
1017cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 25 * 64);
1018cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 31 * 64);
1019cabdff1aSopenharmony_ci
1020cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
1021cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
1022cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
1023cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
1024cabdff1aSopenharmony_ci
1025cabdff1aSopenharmony_ci    vec0 = __lsx_vadd_h(reg0, reg3);
1026cabdff1aSopenharmony_ci    reg0 = __lsx_vsub_h(reg0, reg3);
1027cabdff1aSopenharmony_ci    reg3 = __lsx_vadd_h(reg7, reg4);
1028cabdff1aSopenharmony_ci    reg7 = __lsx_vsub_h(reg7, reg4);
1029cabdff1aSopenharmony_ci    reg4 = __lsx_vadd_h(reg1, reg2);
1030cabdff1aSopenharmony_ci    reg1 = __lsx_vsub_h(reg1, reg2);
1031cabdff1aSopenharmony_ci    reg2 = __lsx_vadd_h(reg6, reg5);
1032cabdff1aSopenharmony_ci    reg6 = __lsx_vsub_h(reg6, reg5);
1033cabdff1aSopenharmony_ci    reg5 = vec0;
1034cabdff1aSopenharmony_ci
1035cabdff1aSopenharmony_ci    /* 4 Stores */
1036cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
1037cabdff1aSopenharmony_ci    __lsx_vst(vec0, tmp_odd_buf, 4 * 16);
1038cabdff1aSopenharmony_ci    __lsx_vst(vec1, tmp_odd_buf, 4 * 16 + 16);
1039cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
1040cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
1041cabdff1aSopenharmony_ci    __lsx_vst(vec0, tmp_odd_buf, 0);
1042cabdff1aSopenharmony_ci    __lsx_vst(vec1, tmp_odd_buf, 16);
1043cabdff1aSopenharmony_ci
1044cabdff1aSopenharmony_ci    /* 4 Stores */
1045cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
1046cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
1047cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
1048cabdff1aSopenharmony_ci    __lsx_vst(vec0, tmp_odd_buf, 6 * 16);
1049cabdff1aSopenharmony_ci    __lsx_vst(vec1, tmp_odd_buf, 6 * 16 + 16);
1050cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
1051cabdff1aSopenharmony_ci    __lsx_vst(vec2, tmp_odd_buf, 2 * 16);
1052cabdff1aSopenharmony_ci    __lsx_vst(vec3, tmp_odd_buf, 2 * 16 + 16);
1053cabdff1aSopenharmony_ci
1054cabdff1aSopenharmony_ci    /* Odd stage 2 */
1055cabdff1aSopenharmony_ci    /* 8 loads */
1056cabdff1aSopenharmony_ci    reg0 = __lsx_vld(tmp_buf, 3 * 64);
1057cabdff1aSopenharmony_ci    reg1 = __lsx_vld(tmp_buf, 5 * 64);
1058cabdff1aSopenharmony_ci    reg2 = __lsx_vld(tmp_buf, 11 * 64);
1059cabdff1aSopenharmony_ci    reg3 = __lsx_vld(tmp_buf, 13 * 64);
1060cabdff1aSopenharmony_ci    reg4 = __lsx_vld(tmp_buf, 19 * 64);
1061cabdff1aSopenharmony_ci    reg5 = __lsx_vld(tmp_buf, 21 * 64);
1062cabdff1aSopenharmony_ci    reg6 = __lsx_vld(tmp_buf, 27 * 64);
1063cabdff1aSopenharmony_ci    reg7 = __lsx_vld(tmp_buf, 29 * 64);
1064cabdff1aSopenharmony_ci
1065cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 3 * 64);
1066cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 5 * 64);
1067cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 11 * 64);
1068cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 13 * 64);
1069cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 19 * 64);
1070cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 21 * 64);
1071cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 27 * 64);
1072cabdff1aSopenharmony_ci    __lsx_vst(zero, tmp_buf, 29 * 64);
1073cabdff1aSopenharmony_ci
1074cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
1075cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
1076cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
1077cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
1078cabdff1aSopenharmony_ci
1079cabdff1aSopenharmony_ci    /* 4 Stores */
1080cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsub_h,reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
1081cabdff1aSopenharmony_ci              vec0, vec1, vec2, vec3);
1082cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
1083cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
1084cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
1085cabdff1aSopenharmony_ci    __lsx_vst(vec0, tmp_odd_buf, 12 * 16);
1086cabdff1aSopenharmony_ci    __lsx_vst(vec1, tmp_odd_buf, 12 * 16 + 3 * 16);
1087cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
1088cabdff1aSopenharmony_ci    __lsx_vst(vec0, tmp_odd_buf, 10 * 16);
1089cabdff1aSopenharmony_ci    __lsx_vst(vec1, tmp_odd_buf, 10 * 16 + 16);
1090cabdff1aSopenharmony_ci
1091cabdff1aSopenharmony_ci    /* 4 Stores */
1092cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7,
1093cabdff1aSopenharmony_ci              vec0, vec1, vec2, vec3);
1094cabdff1aSopenharmony_ci    LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
1095cabdff1aSopenharmony_ci    __lsx_vst(reg0, tmp_odd_buf, 13 * 16);
1096cabdff1aSopenharmony_ci    __lsx_vst(reg1, tmp_odd_buf, 13 * 16 + 16);
1097cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64,
1098cabdff1aSopenharmony_ci                        reg0, reg1);
1099cabdff1aSopenharmony_ci    __lsx_vst(reg0, tmp_odd_buf, 8 * 16);
1100cabdff1aSopenharmony_ci    __lsx_vst(reg1, tmp_odd_buf, 8 * 16 + 16);
1101cabdff1aSopenharmony_ci
1102cabdff1aSopenharmony_ci    /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
1103cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
1104cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16,
1105cabdff1aSopenharmony_ci              tmp_odd_buf, 32, tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
1106cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 8 * 16, tmp_odd_buf, 8 * 16 + 16,
1107cabdff1aSopenharmony_ci              tmp_odd_buf, 8 * 16 + 32, tmp_odd_buf, 8 * 16 + 48,
1108cabdff1aSopenharmony_ci              reg4, reg5, reg6, reg7);
1109cabdff1aSopenharmony_ci
1110cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
1111cabdff1aSopenharmony_ci                  loc0, loc1, loc2, loc3);
1112cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_odd_buf, 0);
1113cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_odd_buf, 16);
1114cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_odd_buf, 32);
1115cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_odd_buf, 48);
1116cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
1117cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
1118cabdff1aSopenharmony_ci
1119cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
1120cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
1121cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_odd_buf, 8 * 16);
1122cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_odd_buf, 8 * 16 + 16);
1123cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_odd_buf, 8 * 16 + 32);
1124cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_odd_buf, 8 * 16 + 48);
1125cabdff1aSopenharmony_ci
1126cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
1127cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 4 * 16, tmp_odd_buf, 4 * 16 + 16,
1128cabdff1aSopenharmony_ci              tmp_odd_buf, 4 * 16 + 32, tmp_odd_buf, 4 * 16 + 48,
1129cabdff1aSopenharmony_ci              reg1, reg2, reg0, reg3);
1130cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 12 * 16, tmp_odd_buf, 12 * 16 + 16,
1131cabdff1aSopenharmony_ci              tmp_odd_buf, 12 * 16 + 32, tmp_odd_buf, 12 * 16 + 48,
1132cabdff1aSopenharmony_ci              reg4, reg5, reg6, reg7);
1133cabdff1aSopenharmony_ci
1134cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
1135cabdff1aSopenharmony_ci              loc0, loc1, loc2, loc3);
1136cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_odd_buf, 4 * 16);
1137cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_odd_buf, 4 * 16 + 16);
1138cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_odd_buf, 4 * 16 + 32);
1139cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_odd_buf, 4 * 16 + 48);
1140cabdff1aSopenharmony_ci
1141cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
1142cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
1143cabdff1aSopenharmony_ci
1144cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
1145cabdff1aSopenharmony_ci    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
1146cabdff1aSopenharmony_ci    __lsx_vst(loc0, tmp_odd_buf, 12 * 16);
1147cabdff1aSopenharmony_ci    __lsx_vst(loc1, tmp_odd_buf, 12 * 16 + 16);
1148cabdff1aSopenharmony_ci    __lsx_vst(loc2, tmp_odd_buf, 12 * 16 + 32);
1149cabdff1aSopenharmony_ci    __lsx_vst(loc3, tmp_odd_buf, 12 * 16 + 48);
1150cabdff1aSopenharmony_ci}
1151cabdff1aSopenharmony_ci
1152cabdff1aSopenharmony_cistatic void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
1153cabdff1aSopenharmony_ci                                                 int16_t *tmp_odd_buf,
1154cabdff1aSopenharmony_ci                                                 uint8_t *dst,
1155cabdff1aSopenharmony_ci                                                 int32_t dst_stride)
1156cabdff1aSopenharmony_ci{
1157cabdff1aSopenharmony_ci    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
1158cabdff1aSopenharmony_ci    __m128i m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
1159cabdff1aSopenharmony_ci
1160cabdff1aSopenharmony_ci    /* FINAL BUTTERFLY : Dependency on Even & Odd */
1161cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 0);
1162cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 9 * 16);
1163cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 14 * 16);
1164cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 6 * 16);
1165cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 0);
1166cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 8 * 16);
1167cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 4 * 16);
1168cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 12 * 16);
1169cabdff1aSopenharmony_ci
1170cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1171cabdff1aSopenharmony_ci              m0, m4, m2, m6);
1172cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
1173cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
1174cabdff1aSopenharmony_ci
1175cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1176cabdff1aSopenharmony_ci              m6, m2, m4, m0);
1177cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
1178cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
1179cabdff1aSopenharmony_ci                        m0, m2, m4, m6);
1180cabdff1aSopenharmony_ci
1181cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
1182cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 4 * 16);
1183cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 13 * 16);
1184cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 10 * 16);
1185cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 3 * 16);
1186cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 2 * 16);
1187cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 10 * 16);
1188cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 6 * 16);
1189cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 14 * 16);
1190cabdff1aSopenharmony_ci
1191cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1192cabdff1aSopenharmony_ci               m1, m5, m3, m7);
1193cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
1194cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
1195cabdff1aSopenharmony_ci                        m1, m3, m5, m7);
1196cabdff1aSopenharmony_ci
1197cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1198cabdff1aSopenharmony_ci              m7, m3, m5, m1);
1199cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
1200cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
1201cabdff1aSopenharmony_ci                        m1, m3, m5, m7);
1202cabdff1aSopenharmony_ci
1203cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
1204cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 2 * 16);
1205cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 11 * 16);
1206cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 12 * 16);
1207cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 7 * 16);
1208cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 1 * 16);
1209cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 9 * 16);
1210cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 5 * 16);
1211cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 13 * 16);
1212cabdff1aSopenharmony_ci
1213cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1214cabdff1aSopenharmony_ci              n0, n4, n2, n6);
1215cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
1216cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
1217cabdff1aSopenharmony_ci                        n0, n2, n4, n6);
1218cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1219cabdff1aSopenharmony_ci              n6, n2, n4, n0);
1220cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
1221cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
1222cabdff1aSopenharmony_ci                        n0, n2, n4, n6);
1223cabdff1aSopenharmony_ci
1224cabdff1aSopenharmony_ci    /* Load 8 & Store 8 */
1225cabdff1aSopenharmony_ci    vec0 = __lsx_vld(tmp_odd_buf, 5 * 16);
1226cabdff1aSopenharmony_ci    vec1 = __lsx_vld(tmp_odd_buf, 15 * 16);
1227cabdff1aSopenharmony_ci    vec2 = __lsx_vld(tmp_odd_buf, 8 * 16);
1228cabdff1aSopenharmony_ci    vec3 = __lsx_vld(tmp_odd_buf, 1 * 16);
1229cabdff1aSopenharmony_ci    loc0 = __lsx_vld(tmp_eve_buf, 3 * 16);
1230cabdff1aSopenharmony_ci    loc1 = __lsx_vld(tmp_eve_buf, 11 * 16);
1231cabdff1aSopenharmony_ci    loc2 = __lsx_vld(tmp_eve_buf, 7 * 16);
1232cabdff1aSopenharmony_ci    loc3 = __lsx_vld(tmp_eve_buf, 15 * 16);
1233cabdff1aSopenharmony_ci
1234cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1235cabdff1aSopenharmony_ci              n1, n5, n3, n7);
1236cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
1237cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
1238cabdff1aSopenharmony_ci                        n1, n3, n5, n7);
1239cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1240cabdff1aSopenharmony_ci              n7, n3, n5, n1);
1241cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
1242cabdff1aSopenharmony_ci    VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
1243cabdff1aSopenharmony_ci                        n1, n3, n5, n7);
1244cabdff1aSopenharmony_ci}
1245cabdff1aSopenharmony_ci
1246cabdff1aSopenharmony_cistatic void vp9_idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
1247cabdff1aSopenharmony_ci                                               int32_t dst_stride)
1248cabdff1aSopenharmony_ci{
1249cabdff1aSopenharmony_ci    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(16);
1250cabdff1aSopenharmony_ci    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(16);
1251cabdff1aSopenharmony_ci
1252cabdff1aSopenharmony_ci    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
1253cabdff1aSopenharmony_ci    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
1254cabdff1aSopenharmony_ci    vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
1255cabdff1aSopenharmony_ci                                         dst, dst_stride);
1256cabdff1aSopenharmony_ci}
1257cabdff1aSopenharmony_ci
1258cabdff1aSopenharmony_cistatic void vp9_idct8x32_1d_columns_lsx(int16_t *input, int16_t *output,
1259cabdff1aSopenharmony_ci                                        int16_t *tmp_buf)
1260cabdff1aSopenharmony_ci{
1261cabdff1aSopenharmony_ci    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(16);
1262cabdff1aSopenharmony_ci    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(16);
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
1265cabdff1aSopenharmony_ci    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
1266cabdff1aSopenharmony_ci    vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0],
1267cabdff1aSopenharmony_ci                                       &tmp_odd_buf[0], output);
1268cabdff1aSopenharmony_ci}
1269cabdff1aSopenharmony_ci
1270cabdff1aSopenharmony_cistatic void vp9_idct32x32_1_add_lsx(int16_t *input, uint8_t *dst,
1271cabdff1aSopenharmony_ci                                    int32_t dst_stride)
1272cabdff1aSopenharmony_ci{
1273cabdff1aSopenharmony_ci    int32_t i;
1274cabdff1aSopenharmony_ci    int16_t out;
1275cabdff1aSopenharmony_ci    uint8_t *dst_tmp = dst + dst_stride;
1276cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
1277cabdff1aSopenharmony_ci    __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
1278cabdff1aSopenharmony_ci    __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;
1279cabdff1aSopenharmony_ci
1280cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
1281cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
1282cabdff1aSopenharmony_ci    out = ROUND_POWER_OF_TWO(out, 6);
1283cabdff1aSopenharmony_ci    input[0] = 0;
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci    vec = __lsx_vreplgr2vr_h(out);
1286cabdff1aSopenharmony_ci
1287cabdff1aSopenharmony_ci    for (i = 16; i--;) {
1288cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
1289cabdff1aSopenharmony_ci        DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst2, dst3);
1290cabdff1aSopenharmony_ci
1291cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
1292cabdff1aSopenharmony_ci                  res0, res1, res2, res3);
1293cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vilvh_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
1294cabdff1aSopenharmony_ci                  res4, res5, res6, res7);
1295cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec,
1296cabdff1aSopenharmony_ci                  res0, res1, res2, res3);
1297cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec,
1298cabdff1aSopenharmony_ci                  res4, res5, res6, res7);
1299cabdff1aSopenharmony_ci        DUP4_ARG1(__lsx_vclip255_h, res0, res1, res2, res3, res0, res1, res2, res3);
1300cabdff1aSopenharmony_ci        DUP4_ARG1(__lsx_vclip255_h, res4, res5, res6, res7, res4, res5, res6, res7);
1301cabdff1aSopenharmony_ci        DUP4_ARG2(__lsx_vpickev_b, res4, res0, res5, res1, res6, res2, res7, res3,
1302cabdff1aSopenharmony_ci                  tmp0, tmp1, tmp2, tmp3);
1303cabdff1aSopenharmony_ci
1304cabdff1aSopenharmony_ci        __lsx_vst(tmp0, dst, 0);
1305cabdff1aSopenharmony_ci        __lsx_vst(tmp1, dst, 16);
1306cabdff1aSopenharmony_ci        __lsx_vst(tmp2, dst_tmp, 0);
1307cabdff1aSopenharmony_ci        __lsx_vst(tmp3, dst_tmp, 16);
1308cabdff1aSopenharmony_ci        dst = dst_tmp + dst_stride;
1309cabdff1aSopenharmony_ci        dst_tmp = dst + dst_stride;
1310cabdff1aSopenharmony_ci    }
1311cabdff1aSopenharmony_ci}
1312cabdff1aSopenharmony_ci
1313cabdff1aSopenharmony_cistatic void vp9_idct32x32_34_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
1314cabdff1aSopenharmony_ci                                               int32_t dst_stride)
1315cabdff1aSopenharmony_ci{
1316cabdff1aSopenharmony_ci    int32_t i;
1317cabdff1aSopenharmony_ci    int16_t out_arr[32 * 32] ALLOC_ALIGNED(16);
1318cabdff1aSopenharmony_ci    int16_t *out_ptr = out_arr;
1319cabdff1aSopenharmony_ci    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(16);
1320cabdff1aSopenharmony_ci    __m128i zero = __lsx_vldi(0);
1321cabdff1aSopenharmony_ci
1322cabdff1aSopenharmony_ci    for (i = 16; i--;) {
1323cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 0);
1324cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 16);
1325cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 32);
1326cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 48);
1327cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 64);
1328cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 80);
1329cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 96);
1330cabdff1aSopenharmony_ci        __lsx_vst(zero, out_ptr, 112);
1331cabdff1aSopenharmony_ci        out_ptr += 64;
1332cabdff1aSopenharmony_ci    }
1333cabdff1aSopenharmony_ci
1334cabdff1aSopenharmony_ci    out_ptr = out_arr;
1335cabdff1aSopenharmony_ci
1336cabdff1aSopenharmony_ci    /* process 8*32 block */
1337cabdff1aSopenharmony_ci    vp9_idct8x32_1d_columns_lsx(input, out_ptr, &tmp_buf[0]);
1338cabdff1aSopenharmony_ci
1339cabdff1aSopenharmony_ci    /* transform columns */
1340cabdff1aSopenharmony_ci    for (i = 0; i < 4; i++) {
1341cabdff1aSopenharmony_ci        /* process 8*32 block */
1342cabdff1aSopenharmony_ci        vp9_idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)),
1343cabdff1aSopenharmony_ci                                           (dst + (i << 3)), dst_stride);
1344cabdff1aSopenharmony_ci    }
1345cabdff1aSopenharmony_ci}
1346cabdff1aSopenharmony_ci
1347cabdff1aSopenharmony_cistatic void vp9_idct32x32_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
1348cabdff1aSopenharmony_ci                                            int32_t dst_stride)
1349cabdff1aSopenharmony_ci{
1350cabdff1aSopenharmony_ci    int32_t i;
1351cabdff1aSopenharmony_ci    int16_t out_arr[32 * 32] ALLOC_ALIGNED(16);
1352cabdff1aSopenharmony_ci    int16_t *out_ptr = out_arr;
1353cabdff1aSopenharmony_ci    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(16);
1354cabdff1aSopenharmony_ci
1355cabdff1aSopenharmony_ci    /* transform rows */
1356cabdff1aSopenharmony_ci    for (i = 0; i < 4; i++) {
1357cabdff1aSopenharmony_ci        /* process 8*32 block */
1358cabdff1aSopenharmony_ci        vp9_idct8x32_1d_columns_lsx((input + (i << 3)), (out_ptr + (i << 8)),
1359cabdff1aSopenharmony_ci                                    &tmp_buf[0]);
1360cabdff1aSopenharmony_ci    }
1361cabdff1aSopenharmony_ci
1362cabdff1aSopenharmony_ci    /* transform columns */
1363cabdff1aSopenharmony_ci    for (i = 0; i < 4; i++) {
1364cabdff1aSopenharmony_ci        /* process 8*32 block */
1365cabdff1aSopenharmony_ci        vp9_idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)),
1366cabdff1aSopenharmony_ci                                           (dst + (i << 3)), dst_stride);
1367cabdff1aSopenharmony_ci    }
1368cabdff1aSopenharmony_ci}
1369cabdff1aSopenharmony_ci
/*
 * 8x8 IDCT + add entry point: picks an implementation from eob.
 *
 *   eob == 1        -> vp9_idct8x8_1_add_lsx
 *   eob <= 12       -> vp9_idct8x8_12_colcol_addblk_lsx
 *   anything larger -> vp9_idct8x8_colcol_addblk_lsx
 *
 * stride is forwarded to helpers that take it as a third argument after
 * block and dst.
 */
void ff_idct_idct_8x8_add_lsx(uint8_t *dst, ptrdiff_t stride,
                              int16_t *block, int eob)
{
    if (eob == 1) {
        vp9_idct8x8_1_add_lsx(block, dst, stride);
        return;
    }

    if (eob <= 12) {
        vp9_idct8x8_12_colcol_addblk_lsx(block, dst, stride);
        return;
    }

    vp9_idct8x8_colcol_addblk_lsx(block, dst, stride);
}
1383cabdff1aSopenharmony_ci
/*
 * 16x16 IDCT + add entry point: picks an implementation from eob.
 *
 *   eob == 1        -> vp9_idct16x16_1_add_lsx (DC-only coefficient)
 *   eob <= 10       -> vp9_idct16x16_10_colcol_addblk_lsx
 *   anything larger -> vp9_idct16x16_colcol_addblk_lsx
 *
 * stride is forwarded to helpers that take it as a third argument after
 * block and dst.
 */
void ff_idct_idct_16x16_add_lsx(uint8_t *dst, ptrdiff_t stride,
                                int16_t *block, int eob)
{
    if (eob == 1) {
        /* DC only DCT coefficient. */
        vp9_idct16x16_1_add_lsx(block, dst, stride);
        return;
    }

    if (eob <= 10) {
        vp9_idct16x16_10_colcol_addblk_lsx(block, dst, stride);
        return;
    }

    vp9_idct16x16_colcol_addblk_lsx(block, dst, stride);
}
1398cabdff1aSopenharmony_ci
/*
 * 32x32 IDCT + add entry point: picks an implementation from eob.
 *
 *   eob == 1        -> vp9_idct32x32_1_add_lsx
 *   eob <= 34       -> vp9_idct32x32_34_colcol_addblk_lsx
 *   anything larger -> vp9_idct32x32_colcol_addblk_lsx
 *
 * stride is forwarded to helpers that declare it as int32_t dst_stride.
 */
void ff_idct_idct_32x32_add_lsx(uint8_t *dst, ptrdiff_t stride,
                                int16_t *block, int eob)
{
    if (eob == 1) {
        vp9_idct32x32_1_add_lsx(block, dst, stride);
        return;
    }

    if (eob <= 34) {
        vp9_idct32x32_34_colcol_addblk_lsx(block, dst, stride);
        return;
    }

    vp9_idct32x32_colcol_addblk_lsx(block, dst, stride);
}
1412