1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited
3cabdff1aSopenharmony_ci * Contributed by Hao Chen <chenhao@loongson.cn>
4cabdff1aSopenharmony_ci *
5cabdff1aSopenharmony_ci * This file is part of FFmpeg.
6cabdff1aSopenharmony_ci *
7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci *
12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci * Lesser General Public License for more details.
16cabdff1aSopenharmony_ci *
17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci */
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h"
23cabdff1aSopenharmony_ci#include "idctdsp_loongarch.h"
24cabdff1aSopenharmony_ci
/*
 * Rearrange four 256-bit vectors of 16-bit elements (in_0..in_3) into
 * out_0..out_3 in three stages:
 *   1. __lasx_xvpermi_q with selectors 0x20 / 0x31 regroups the 128-bit
 *      halves of the pairs (in_2, in_0) and (in_3, in_1);
 *   2. __lasx_xvilvl_h / __lasx_xvilvh_h interleave halfwords;
 *   3. __lasx_xvilvl_w / __lasx_xvilvh_w interleave words.
 *
 * Every input is consumed into a block-local temporary before any output is
 * assigned, so an output may name the same variable as an input (both uses
 * in this file pass in0..in3 as inputs and outputs).
 *
 * NOTE(review): DUP2_ARG2 / DUP4_ARG3 come from loongson_intrinsics.h and are
 * assumed to expand to independent "out = fn(args)" assignments evaluated in
 * argument order - confirm against that header.
 */
25cabdff1aSopenharmony_ci#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
26cabdff1aSopenharmony_ci{                                                                              \
27cabdff1aSopenharmony_ci    __m256i temp_0, temp_1, temp_2, temp_3;                                    \
28cabdff1aSopenharmony_ci    __m256i temp_4, temp_5, temp_6, temp_7;                                    \
29cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
30cabdff1aSopenharmony_ci              0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3);         \
31cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
32cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
33cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2);  \
34cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3);  \
35cabdff1aSopenharmony_ci}
36cabdff1aSopenharmony_ci
/*
 * First (row) pass of the inverse DCT, expanded inline into each
 * ff_simple_idct*_lasx function.  This is a statement macro, not an
 * expression: it reads and writes locals of the enclosing function by name.
 *
 * Reads:   block       - 128 bytes loaded at byte offsets 0, 32, 64 and 96;
 *          w1          - packed table of 16-bit multipliers;
 *          const_val0  - 32-bit seed added into the even-part accumulator.
 * Writes:  in0..in3    - result of this pass (16-bit elements);
 *          const_val, const_val1 - overwritten with
 *                        16383 * ((1 << 19) / 16383), the seed consumed later
 *                        by LASX_IDCTCOLS;
 *          w2..w7, w1  - halfword 2..7 and finally halfword 1 of the original
 *                        w1, each replicated by __lasx_xvrepl128vei_h.  w1 is
 *                        overwritten last because the other six read it;
 *          a0..a3, b0..b3, temp, temp0..temp3, select_vec - scratch.
 *
 * select_vec is the per-32-bit-lane test "(a0|a1|a2|a3|b0|b1|b2|b3) < 1"
 * (unsigned), i.e. true where the OR of the widened inputs is zero; temp
 * holds a0 << 3.  After the sums and differences are shifted right by 11,
 * __lasx_xvbitsel_v merges each of them with temp under select_vec.
 *
 * NOTE(review): this looks like the DC-only shortcut implied by "CondDC" in
 * the macro name - confirm the operand order of __lasx_xvbitsel_v against the
 * LASX intrinsics reference.
 * NOTE(review): the inline comment "if (AV_RAN64A(row + 4))" appears to quote
 * a condition from a scalar reference implementation; there is no branch
 * here, the accumulation that follows it always executes.
 */
37cabdff1aSopenharmony_ci#define LASX_IDCTROWCONDDC                                                     \
38cabdff1aSopenharmony_ci    const_val  = 16383 * ((1 << 19) / 16383);                                  \
39cabdff1aSopenharmony_ci    const_val1 = __lasx_xvreplgr2vr_w(const_val);                              \
40cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,          \
41cabdff1aSopenharmony_ci              in0, in1, in2, in3);                                             \
42cabdff1aSopenharmony_ci    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
43cabdff1aSopenharmony_ci    a0 = __lasx_xvpermi_d(in0, 0xD8);                                          \
44cabdff1aSopenharmony_ci    a0 = __lasx_vext2xv_w_h(a0);                                               \
45cabdff1aSopenharmony_ci    temp  = __lasx_xvslli_w(a0, 3);                                            \
46cabdff1aSopenharmony_ci    a1 = __lasx_xvpermi_d(in0, 0x8D);                                          \
47cabdff1aSopenharmony_ci    a1 = __lasx_vext2xv_w_h(a1);                                               \
48cabdff1aSopenharmony_ci    a2 = __lasx_xvpermi_d(in1, 0xD8);                                          \
49cabdff1aSopenharmony_ci    a2 = __lasx_vext2xv_w_h(a2);                                               \
50cabdff1aSopenharmony_ci    a3 = __lasx_xvpermi_d(in1, 0x8D);                                          \
51cabdff1aSopenharmony_ci    a3 = __lasx_vext2xv_w_h(a3);                                               \
52cabdff1aSopenharmony_ci    b0 = __lasx_xvpermi_d(in2, 0xD8);                                          \
53cabdff1aSopenharmony_ci    b0 = __lasx_vext2xv_w_h(b0);                                               \
54cabdff1aSopenharmony_ci    b1 = __lasx_xvpermi_d(in2, 0x8D);                                          \
55cabdff1aSopenharmony_ci    b1 = __lasx_vext2xv_w_h(b1);                                               \
56cabdff1aSopenharmony_ci    b2 = __lasx_xvpermi_d(in3, 0xD8);                                          \
57cabdff1aSopenharmony_ci    b2 = __lasx_vext2xv_w_h(b2);                                               \
58cabdff1aSopenharmony_ci    b3 = __lasx_xvpermi_d(in3, 0x8D);                                          \
59cabdff1aSopenharmony_ci    b3 = __lasx_vext2xv_w_h(b3);                                               \
60cabdff1aSopenharmony_ci    select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3;                        \
61cabdff1aSopenharmony_ci    select_vec = __lasx_xvslti_wu(select_vec, 1);                              \
62cabdff1aSopenharmony_ci                                                                               \
63cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5,               \
64cabdff1aSopenharmony_ci              w2, w3, w4, w5);                                                 \
65cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7);                    \
66cabdff1aSopenharmony_ci    w1 = __lasx_xvrepl128vei_h(w1, 1);                                         \
67cabdff1aSopenharmony_ci                                                                               \
68cabdff1aSopenharmony_ci    /* part of FUNC6(idctRowCondDC) */                                         \
69cabdff1aSopenharmony_ci    temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4);                          \
70cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
71cabdff1aSopenharmony_ci    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
72cabdff1aSopenharmony_ci    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
73cabdff1aSopenharmony_ci    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
74cabdff1aSopenharmony_ci    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
75cabdff1aSopenharmony_ci                                                                               \
76cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
77cabdff1aSopenharmony_ci    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
78cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w7);                                                \
79cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
80cabdff1aSopenharmony_ci    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
81cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w1);                                                \
82cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
83cabdff1aSopenharmony_ci    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
84cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w5);                                                \
85cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
86cabdff1aSopenharmony_ci    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
87cabdff1aSopenharmony_ci                                                                               \
88cabdff1aSopenharmony_ci    /* if (AV_RAN64A(row + 4)) */                                              \
89cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
90cabdff1aSopenharmony_ci    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
91cabdff1aSopenharmony_ci    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
92cabdff1aSopenharmony_ci    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
93cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w4);                                                \
94cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
95cabdff1aSopenharmony_ci    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
96cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w6);                                                \
97cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
98cabdff1aSopenharmony_ci    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
99cabdff1aSopenharmony_ci                                                                               \
100cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
101cabdff1aSopenharmony_ci    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
102cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
103cabdff1aSopenharmony_ci    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
104cabdff1aSopenharmony_ci    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
105cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w1);                                                \
106cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
107cabdff1aSopenharmony_ci    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
108cabdff1aSopenharmony_ci                                                                               \
109cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
110cabdff1aSopenharmony_ci              temp0, temp1, temp2, temp3);                                     \
111cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
112cabdff1aSopenharmony_ci              a0, a1, a2, a3);                                                 \
113cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11,     \
114cabdff1aSopenharmony_ci              temp0, temp1, temp2, temp3);                                     \
115cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
116cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp,         \
117cabdff1aSopenharmony_ci              select_vec, temp2, temp, select_vec, temp3, temp, select_vec,    \
118cabdff1aSopenharmony_ci              in0, in1, in2, in3);                                             \
119cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp,               \
120cabdff1aSopenharmony_ci              select_vec, a2, temp, select_vec, a3, temp, select_vec,          \
121cabdff1aSopenharmony_ci              a0, a1, a2, a3);                                                 \
122cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1,           \
123cabdff1aSopenharmony_ci              in0, in1, in2, in3);                                             \
124cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,    \
125cabdff1aSopenharmony_ci              in0, in1, in2, in3);                                             \
126cabdff1aSopenharmony_ci
/*
 * Second (column) pass, expanded inline right after LASX_IDCTROWCONDDC.
 * Statement macro operating on the enclosing function's locals.
 *
 * Reads:   in0..in3   - output of the row pass;
 *          w1..w7     - replicated multipliers as left by LASX_IDCTROWCONDDC;
 *          const_val1 - 32-bit seed set by LASX_IDCTROWCONDDC.
 * Writes:  in0..in3   - 32-bit sums/differences shifted right by 20 and
 *                       packed to 16-bit elements by __lasx_xvsrani_h_w;
 *          a0..a3, b0..b3, temp0..temp3 - scratch.
 *
 * The multiply/accumulate sequence is the same as in the row pass; what
 * differs is the seed (const_val1 rather than const_val0), the shift (20
 * rather than 11), the order of the subtraction results (a3..a0) and the
 * absence of the select_vec merge.  Every caller in this file still applies
 * __lasx_xvpermi_d(..., 0xD8) to in0..in3 before storing.
 *
 * NOTE(review): the inline comment "if (AV_RAN64A(row + 4))" appears to be
 * carried over from the row pass; no branch exists here.
 */
127cabdff1aSopenharmony_ci#define LASX_IDCTCOLS                                                          \
128cabdff1aSopenharmony_ci    /* part of FUNC6(idctSparseCol) */                                         \
129cabdff1aSopenharmony_ci    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
130cabdff1aSopenharmony_ci    temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4);                          \
131cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
132cabdff1aSopenharmony_ci    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
133cabdff1aSopenharmony_ci    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
134cabdff1aSopenharmony_ci    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
135cabdff1aSopenharmony_ci    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
136cabdff1aSopenharmony_ci                                                                               \
137cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
138cabdff1aSopenharmony_ci    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
139cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w7);                                                \
140cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
141cabdff1aSopenharmony_ci    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
142cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w1);                                                \
143cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
144cabdff1aSopenharmony_ci    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
145cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w5);                                                \
146cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
147cabdff1aSopenharmony_ci    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
148cabdff1aSopenharmony_ci                                                                               \
149cabdff1aSopenharmony_ci    /* if (AV_RAN64A(row + 4)) */                                              \
150cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
151cabdff1aSopenharmony_ci    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
152cabdff1aSopenharmony_ci    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
153cabdff1aSopenharmony_ci    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
154cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w4);                                                \
155cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
156cabdff1aSopenharmony_ci    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
157cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w6);                                                \
158cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
159cabdff1aSopenharmony_ci    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
160cabdff1aSopenharmony_ci                                                                               \
161cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
162cabdff1aSopenharmony_ci    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
163cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
164cabdff1aSopenharmony_ci    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
165cabdff1aSopenharmony_ci    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
166cabdff1aSopenharmony_ci    temp1 = __lasx_xvneg_h(w1);                                                \
167cabdff1aSopenharmony_ci    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
168cabdff1aSopenharmony_ci    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
169cabdff1aSopenharmony_ci                                                                               \
170cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
171cabdff1aSopenharmony_ci              temp0, temp1, temp2, temp3);                                     \
172cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0,                  \
173cabdff1aSopenharmony_ci              a3, a2, a1, a0);                                                 \
174cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3,  \
175cabdff1aSopenharmony_ci              20, a0, a1, 20, in0, in1, in2, in3);                             \
176cabdff1aSopenharmony_ci
/*
 * In-place inverse DCT of one coefficient block using LASX vectors.
 *
 * block: in/out.  128 bytes (64 int16_t) are loaded by LASX_IDCTROWCONDDC and
 *        overwritten here at byte offsets 0, 32, 64 and 96.
 *
 * The local variable names are part of the contract with the two macros,
 * which reference them directly; do not rename them in isolation.
 */
177cabdff1aSopenharmony_civoid ff_simple_idct_lasx(int16_t *block)
178cabdff1aSopenharmony_ci{
    /* Seed for the row pass: 1 << 10 is half of that pass's ">> 11" step. */
179cabdff1aSopenharmony_ci    int32_t const_val = 1 << 10;
    /*
     * Packed 16-bit multipliers, identical in both 128-bit halves.
     * Halfwords 1..7 (0x58C5, 0x539F, 0x4B42, 0x3FFF, 0x3249, 0x22A3, 0x11A8)
     * are broadcast into w1..w7 by the row macro; halfword 0 is unused.
     */
180cabdff1aSopenharmony_ci    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
181cabdff1aSopenharmony_ci                  0x4B42539F58C50000, 0x11A822A332493FFF};
182cabdff1aSopenharmony_ci    __m256i in0, in1, in2, in3;
183cabdff1aSopenharmony_ci    __m256i w2, w3, w4, w5, w6, w7;
184cabdff1aSopenharmony_ci    __m256i a0, a1, a2, a3;
185cabdff1aSopenharmony_ci    __m256i b0, b1, b2, b3;
186cabdff1aSopenharmony_ci    __m256i temp0, temp1, temp2, temp3;
    /* Must be built before the row macro runs: the macro reassigns const_val. */
187cabdff1aSopenharmony_ci    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
188cabdff1aSopenharmony_ci    __m256i const_val1, select_vec, temp;
189cabdff1aSopenharmony_ci
    /* Row pass, then column pass; both leave their result in in0..in3. */
190cabdff1aSopenharmony_ci    LASX_IDCTROWCONDDC
191cabdff1aSopenharmony_ci    LASX_IDCTCOLS
    /* Reorder the 64-bit elements of each vector (selector 0xD8) for storage. */
192cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
193cabdff1aSopenharmony_ci              in0, in1, in2, in3);
    /* Write the four 32-byte vectors back over the input coefficients. */
194cabdff1aSopenharmony_ci    __lasx_xvst(in0, block, 0);
195cabdff1aSopenharmony_ci    __lasx_xvst(in1, block, 32);
196cabdff1aSopenharmony_ci    __lasx_xvst(in2, block, 64);
197cabdff1aSopenharmony_ci    __lasx_xvst(in3, block, 96);
198cabdff1aSopenharmony_ci}
199cabdff1aSopenharmony_ci
/*
 * Inverse DCT of one coefficient block, with the result clipped and written
 * as bytes to an 8-row destination.
 *
 * dst:        output; 8 bytes are stored at each of dst + k * dst_stride for
 *             k = 0..7.  Previous contents are not read.
 * dst_stride: distance in bytes between consecutive output rows.
 * block:      input coefficients; 128 bytes are read by LASX_IDCTROWCONDDC.
 *             This function does not store to block.
 *
 * The local variable names are part of the contract with the two macros,
 * which reference them directly.
 */
200cabdff1aSopenharmony_civoid ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
201cabdff1aSopenharmony_ci                             int16_t *block)
202cabdff1aSopenharmony_ci{
    /* Seed for the row pass: 1 << 10 is half of that pass's ">> 11" step. */
203cabdff1aSopenharmony_ci    int32_t const_val = 1 << 10;
204cabdff1aSopenharmony_ci    ptrdiff_t dst_stride_2x = dst_stride << 1;
205cabdff1aSopenharmony_ci    ptrdiff_t dst_stride_4x = dst_stride << 2;
206cabdff1aSopenharmony_ci    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
    /* Packed 16-bit multipliers; halfwords 1..7 become w1..w7 in the row macro. */
207cabdff1aSopenharmony_ci    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
208cabdff1aSopenharmony_ci                  0x4B42539F58C50000, 0x11A822A332493FFF};
209cabdff1aSopenharmony_ci    __m256i in0, in1, in2, in3;
210cabdff1aSopenharmony_ci    __m256i w2, w3, w4, w5, w6, w7;
211cabdff1aSopenharmony_ci    __m256i a0, a1, a2, a3;
212cabdff1aSopenharmony_ci    __m256i b0, b1, b2, b3;
213cabdff1aSopenharmony_ci    __m256i temp0, temp1, temp2, temp3;
    /* Must be built before the row macro runs: the macro reassigns const_val. */
214cabdff1aSopenharmony_ci    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
215cabdff1aSopenharmony_ci    __m256i const_val1, select_vec, temp;
216cabdff1aSopenharmony_ci
    /* Row pass, then column pass; both leave their result in in0..in3. */
217cabdff1aSopenharmony_ci    LASX_IDCTROWCONDDC
218cabdff1aSopenharmony_ci    LASX_IDCTCOLS
219cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
220cabdff1aSopenharmony_ci              in0, in1, in2, in3);
    /* Clamp each 16-bit result, then keep the even-indexed bytes of each pair of vectors. */
221cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
222cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    /*
     * Rows 0..3 come from in0, rows 4..7 from in1.  After the pack above the
     * 64-bit elements are taken in the order 0, 2, 1, 3 for consecutive rows.
     */
223cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst, 0, 0);
224cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
225cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
226cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
227cabdff1aSopenharmony_ci    dst += dst_stride_4x;
228cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst, 0, 0);
229cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
230cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
231cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
232cabdff1aSopenharmony_ci}
233cabdff1aSopenharmony_ci
/*
 * Inverse DCT of one coefficient block, with the result added to the bytes
 * already in an 8-row destination, clipped, and written back.
 *
 * dst:        in/out; 8 bytes are read from and then stored to each of
 *             dst + k * dst_stride for k = 0..7.
 * dst_stride: distance in bytes between consecutive rows.
 * block:      input coefficients; 128 bytes are read by LASX_IDCTROWCONDDC.
 *             This function does not store to block.
 *
 * The local variable names are part of the contract with the two macros,
 * which reference them directly.  a0..a3 and b0..b3 are scratch inside the
 * macros and are reused afterwards to hold the destination rows.
 */
234cabdff1aSopenharmony_civoid ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
235cabdff1aSopenharmony_ci                             int16_t *block)
236cabdff1aSopenharmony_ci{
    /* Seed for the row pass: 1 << 10 is half of that pass's ">> 11" step. */
237cabdff1aSopenharmony_ci    int32_t const_val = 1 << 10;
    /* Separate read cursor so that dst itself is still the first row when storing. */
238cabdff1aSopenharmony_ci    uint8_t *dst1 = dst;
239cabdff1aSopenharmony_ci    ptrdiff_t dst_stride_2x = dst_stride << 1;
240cabdff1aSopenharmony_ci    ptrdiff_t dst_stride_4x = dst_stride << 2;
241cabdff1aSopenharmony_ci    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
242cabdff1aSopenharmony_ci
    /* Packed 16-bit multipliers; halfwords 1..7 become w1..w7 in the row macro. */
243cabdff1aSopenharmony_ci    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
244cabdff1aSopenharmony_ci                  0x4B42539F58C50000, 0x11A822A332493FFF};
    /*
     * Halfword index table for __lasx_xvshuf_h, used below to merge two
     * widened destination rows into one vector.
     * NOTE(review): the resulting element order is assumed to match the
     * layout of in0..in3 after LASX_IDCTCOLS - confirm against the
     * __lasx_xvshuf_h definition.
     */
245cabdff1aSopenharmony_ci    __m256i sh = {0x0003000200010000, 0x000B000A00090008,
246cabdff1aSopenharmony_ci                  0x0007000600050004, 0x000F000E000D000C};
247cabdff1aSopenharmony_ci    __m256i in0, in1, in2, in3;
248cabdff1aSopenharmony_ci    __m256i w2, w3, w4, w5, w6, w7;
249cabdff1aSopenharmony_ci    __m256i a0, a1, a2, a3;
250cabdff1aSopenharmony_ci    __m256i b0, b1, b2, b3;
251cabdff1aSopenharmony_ci    __m256i temp0, temp1, temp2, temp3;
    /* Must be built before the row macro runs: the macro reassigns const_val. */
252cabdff1aSopenharmony_ci    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
253cabdff1aSopenharmony_ci    __m256i const_val1, select_vec, temp;
254cabdff1aSopenharmony_ci
    /* Row pass, then column pass; both leave their result in in0..in3. */
255cabdff1aSopenharmony_ci    LASX_IDCTROWCONDDC
256cabdff1aSopenharmony_ci    LASX_IDCTCOLS
    /*
     * Load the eight destination rows (8 bytes each, replicated across the
     * vector) and zero-extend the bytes to unsigned 16-bit elements.
     */
257cabdff1aSopenharmony_ci    a0    = __lasx_xvldrepl_d(dst1, 0);
258cabdff1aSopenharmony_ci    a0    = __lasx_vext2xv_hu_bu(a0);
259cabdff1aSopenharmony_ci    dst1 += dst_stride;
260cabdff1aSopenharmony_ci    a1    = __lasx_xvldrepl_d(dst1, 0);
261cabdff1aSopenharmony_ci    a1    = __lasx_vext2xv_hu_bu(a1);
262cabdff1aSopenharmony_ci    dst1 += dst_stride;
263cabdff1aSopenharmony_ci    a2    = __lasx_xvldrepl_d(dst1, 0);
264cabdff1aSopenharmony_ci    a2    = __lasx_vext2xv_hu_bu(a2);
265cabdff1aSopenharmony_ci    dst1 += dst_stride;
266cabdff1aSopenharmony_ci    a3    = __lasx_xvldrepl_d(dst1, 0);
267cabdff1aSopenharmony_ci    a3    = __lasx_vext2xv_hu_bu(a3);
268cabdff1aSopenharmony_ci    dst1 += dst_stride;
269cabdff1aSopenharmony_ci    b0    = __lasx_xvldrepl_d(dst1, 0);
270cabdff1aSopenharmony_ci    b0    = __lasx_vext2xv_hu_bu(b0);
271cabdff1aSopenharmony_ci    dst1 += dst_stride;
272cabdff1aSopenharmony_ci    b1    = __lasx_xvldrepl_d(dst1, 0);
273cabdff1aSopenharmony_ci    b1    = __lasx_vext2xv_hu_bu(b1);
274cabdff1aSopenharmony_ci    dst1 += dst_stride;
275cabdff1aSopenharmony_ci    b2    = __lasx_xvldrepl_d(dst1, 0);
276cabdff1aSopenharmony_ci    b2    = __lasx_vext2xv_hu_bu(b2);
277cabdff1aSopenharmony_ci    dst1 += dst_stride;
278cabdff1aSopenharmony_ci    b3    = __lasx_xvldrepl_d(dst1, 0);
279cabdff1aSopenharmony_ci    b3    = __lasx_vext2xv_hu_bu(b3);
    /* Pair up rows (0,1), (2,3), (4,5), (6,7) into one vector each via sh. */
280cabdff1aSopenharmony_ci    DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
281cabdff1aSopenharmony_ci              temp0, temp1, temp2, temp3);
    /* Add the existing pixels to the IDCT output as 16-bit values. */
282cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
283cabdff1aSopenharmony_ci              in0, in1, in2, in3);
284cabdff1aSopenharmony_ci    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
285cabdff1aSopenharmony_ci              in0, in1, in2, in3);
    /* Clamp each 16-bit sum, then keep the even-indexed bytes of each pair of vectors. */
286cabdff1aSopenharmony_ci    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
287cabdff1aSopenharmony_ci    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    /*
     * Rows 0..3 come from in0, rows 4..7 from in1; 64-bit elements are taken
     * in the order 0, 2, 1, 3 for consecutive rows.
     */
288cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst, 0, 0);
289cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
290cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
291cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
292cabdff1aSopenharmony_ci    dst += dst_stride_4x;
293cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst, 0, 0);
294cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
295cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
296cabdff1aSopenharmony_ci    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
297cabdff1aSopenharmony_ci}
298