/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "idctdsp_loongarch.h"

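/* Rearrangement helper shared by the row and column passes: each of the four
 * 256-bit inputs carries 16 int16 values (one quarter of the 8x8 block).
 * 128-bit lane permutes followed by halfword and word interleaves regroup
 * the elements so that the butterfly steps below work on whole vectors. */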
#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
{                                                                              \
    __m256i temp_0, temp_1, temp_2, temp_3;                                    \
    __m256i temp_4, temp_5, temp_6, temp_7;                                    \
    DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
              0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3);         \
    DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
    DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
    DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2);  \
    DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3);  \
}

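/* Row pass of the 8x8 IDCT, corresponding to idctRowCondDC() in
 * libavcodec/simple_idct_template.c.  On entry w1 holds the eight weights
 * {0, W1, ..., W7} as packed int16; xvrepl128vei_h splits them into w1..w7.
 * Rows that are entirely zero are short-circuited through select_vec and
 * xvbitsel_v, all other rows run the full butterfly and are scaled down by
 * ROW_SHIFT (11) before being repacked to 16 bits. */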
#define LASX_IDCTROWCONDDC                                                     \
    const_val  = 16383 * ((1 << 19) / 16383);                                  \
    const_val1 = __lasx_xvreplgr2vr_w(const_val);                              \
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,          \
              in0, in1, in2, in3);                                             \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
    a0 = __lasx_xvpermi_d(in0, 0xD8);                                          \
    a0 = __lasx_vext2xv_w_h(a0);                                               \
    temp  = __lasx_xvslli_w(a0, 3);                                            \
    a1 = __lasx_xvpermi_d(in0, 0x8D);                                          \
    a1 = __lasx_vext2xv_w_h(a1);                                               \
    a2 = __lasx_xvpermi_d(in1, 0xD8);                                          \
    a2 = __lasx_vext2xv_w_h(a2);                                               \
    a3 = __lasx_xvpermi_d(in1, 0x8D);                                          \
    a3 = __lasx_vext2xv_w_h(a3);                                               \
    b0 = __lasx_xvpermi_d(in2, 0xD8);                                          \
    b0 = __lasx_vext2xv_w_h(b0);                                               \
    b1 = __lasx_xvpermi_d(in2, 0x8D);                                          \
    b1 = __lasx_vext2xv_w_h(b1);                                               \
    b2 = __lasx_xvpermi_d(in3, 0xD8);                                          \
    b2 = __lasx_vext2xv_w_h(b2);                                               \
    b3 = __lasx_xvpermi_d(in3, 0x8D);                                          \
    b3 = __lasx_vext2xv_w_h(b3);                                               \
    select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3;                        \
    select_vec = __lasx_xvslti_wu(select_vec, 1);                              \
                                                                               \
    DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5,               \
              w2, w3, w4, w5);                                                 \
    DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7);                    \
    w1 = __lasx_xvrepl128vei_h(w1, 1);                                         \
                                                                               \
    /* part of FUNC6(idctRowCondDC) */                                         \
    temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4);                          \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
    temp1 = __lasx_xvneg_h(w7);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w5);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
                                                                               \
    /* if (AV_RN64A(row + 4)) */                                               \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
    temp1 = __lasx_xvneg_h(w4);                                                \
    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w6);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
                                                                               \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              a0, a1, a2, a3);                                                 \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11,     \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
    DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp,         \
              select_vec, temp2, temp, select_vec, temp3, temp, select_vec,    \
              in0, in1, in2, in3);                                             \
    DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp,               \
              select_vec, a2, temp, select_vec, a3, temp, select_vec,          \
              a0, a1, a2, a3);                                                 \
    DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1,           \
              in0, in1, in2, in3);                                             \
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,    \
              in0, in1, in2, in3);

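/* Column pass, corresponding to idctSparseCol() in
 * libavcodec/simple_idct_template.c: the same butterfly as the row pass, but
 * seeded with the DC rounding constant const_val1 and finished with a
 * narrowing right shift by COL_SHIFT (20) via xvsrani_h_w. */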
#define LASX_IDCTCOLS                                                          \
    /* part of FUNC6(idctSparseCol) */                                         \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
    temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4);                          \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
    temp1 = __lasx_xvneg_h(w7);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w5);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
                                                                               \
    /* if (AV_RN64A(row + 4)) */                                               \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
    temp1 = __lasx_xvneg_h(w4);                                                \
    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w6);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
                                                                               \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0,                  \
              a3, a2, a1, a0);                                                 \
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3,  \
              20, a0, a1, 20, in0, in1, in2, in3);

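/* In-place 8x8 inverse DCT: run the row and column passes and store the
 * 16-bit result back into block. */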
void ff_simple_idct_lasx(int16_t *block)
{
    int32_t const_val = 1 << 10;
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}

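/* IDCT followed by "put": clip the result to the 0..255 range and store it
 * as an 8x8 block of bytes at dst with stride dst_stride. */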
void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}

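/* IDCT followed by "add": add the result to the 8x8 block of pixels already
 * at dst, clip to 0..255 and store the sum back. */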
void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;
    uint8_t *dst1 = dst;
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i sh = {0x0003000200010000, 0x000B000A00090008,
                  0x0007000600050004, 0x000F000E000D000C};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    a0    = __lasx_xvldrepl_d(dst1, 0);
    a0    = __lasx_vext2xv_hu_bu(a0);
    dst1 += dst_stride;
    a1    = __lasx_xvldrepl_d(dst1, 0);
    a1    = __lasx_vext2xv_hu_bu(a1);
    dst1 += dst_stride;
    a2    = __lasx_xvldrepl_d(dst1, 0);
    a2    = __lasx_vext2xv_hu_bu(a2);
    dst1 += dst_stride;
    a3    = __lasx_xvldrepl_d(dst1, 0);
    a3    = __lasx_vext2xv_hu_bu(a3);
    dst1 += dst_stride;
    b0    = __lasx_xvldrepl_d(dst1, 0);
    b0    = __lasx_vext2xv_hu_bu(b0);
    dst1 += dst_stride;
    b1    = __lasx_xvldrepl_d(dst1, 0);
    b1    = __lasx_vext2xv_hu_bu(b1);
    dst1 += dst_stride;
    b2    = __lasx_xvldrepl_d(dst1, 0);
    b2    = __lasx_vext2xv_hu_bu(b2);
    dst1 += dst_stride;
    b3    = __lasx_xvldrepl_d(dst1, 0);
    b3    = __lasx_vext2xv_hu_bu(b3);
    DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}