1/*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Hao Chen <chenhao@loongson.cn>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/loongarch/loongson_intrinsics.h"
23#include "idctdsp_loongarch.h"
24
/**
 * Transpose helper for the row/column passes below.
 * Regroups four 256-bit vectors of 16-bit coefficients: first swaps the
 * 128-bit halves across vector pairs (xvpermi_q), then performs 16-bit and
 * 32-bit interleaves so that corresponding coefficients from all rows land
 * in the same lanes, ready for the xvdp2-based butterfly stages.
 * NOTE(review): effectively a 4x4 halfword transpose applied per column
 * group — confirm against the scalar simple_idct data layout.
 * In-place use (out_x aliasing in_x) is safe: all inputs are consumed into
 * temporaries before any output is written.
 */
#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
{                                                                              \
    __m256i temp_0, temp_1, temp_2, temp_3;                                    \
    __m256i temp_4, temp_5, temp_6, temp_7;                                    \
    DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
              0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3);         \
    DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
    DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
    DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2);  \
    DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3);  \
}
36
/**
 * Row pass of the 8x8 simple IDCT with the DC-only shortcut, mirroring
 * FUNC6(idctRowCondDC) from the scalar template (ROW_SHIFT = 11, see the
 * xvsrai_w by 11 below).
 *
 * Expects in enclosing scope: block, in0..in3, w1..w7, a0..a3, b0..b3,
 * temp, temp0..temp3, const_val, const_val0, const_val1, select_vec.
 *
 * On entry w1 holds the packed cosine table per 64-bit group:
 * {0, W1=22725, W2=21407, W3=19265} and {W4=16383, W5=12873, W6=8867,
 * W7=4520}; xvrepl128vei_h broadcasts each coefficient into its own
 * vector w1..w7.
 *
 * select_vec becomes all-ones in lanes whose source coefficients are all
 * zero; xvbitsel_v then substitutes the shortcut value temp (dc << 3) for
 * the full butterfly result in those lanes.
 */
#define LASX_IDCTROWCONDDC                                                     \
    const_val  = 16383 * ((1 << 19) / 16383);                                  \
    const_val1 = __lasx_xvreplgr2vr_w(const_val);                              \
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,          \
              in0, in1, in2, in3);                                             \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
    a0 = __lasx_xvpermi_d(in0, 0xD8);                                          \
    a0 = __lasx_vext2xv_w_h(a0);                                               \
    temp  = __lasx_xvslli_w(a0, 3);                                            \
    a1 = __lasx_xvpermi_d(in0, 0x8D);                                          \
    a1 = __lasx_vext2xv_w_h(a1);                                               \
    a2 = __lasx_xvpermi_d(in1, 0xD8);                                          \
    a2 = __lasx_vext2xv_w_h(a2);                                               \
    a3 = __lasx_xvpermi_d(in1, 0x8D);                                          \
    a3 = __lasx_vext2xv_w_h(a3);                                               \
    b0 = __lasx_xvpermi_d(in2, 0xD8);                                          \
    b0 = __lasx_vext2xv_w_h(b0);                                               \
    b1 = __lasx_xvpermi_d(in2, 0x8D);                                          \
    b1 = __lasx_vext2xv_w_h(b1);                                               \
    b2 = __lasx_xvpermi_d(in3, 0xD8);                                          \
    b2 = __lasx_vext2xv_w_h(b2);                                               \
    b3 = __lasx_xvpermi_d(in3, 0x8D);                                          \
    b3 = __lasx_vext2xv_w_h(b3);                                               \
    select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3;                        \
    select_vec = __lasx_xvslti_wu(select_vec, 1);                              \
                                                                               \
    DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5,               \
              w2, w3, w4, w5);                                                 \
    DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7);                    \
    w1 = __lasx_xvrepl128vei_h(w1, 1);                                         \
                                                                               \
    /* part of FUNC6(idctRowCondDC) */                                         \
    temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4);                          \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
    temp1 = __lasx_xvneg_h(w7);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w5);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
                                                                               \
    /* if (AV_RN64A(row + 4)) */                                               \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
    temp1 = __lasx_xvneg_h(w4);                                                \
    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w6);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
                                                                               \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              a0, a1, a2, a3);                                                 \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11,     \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
    DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp,         \
              select_vec, temp2, temp, select_vec, temp3, temp, select_vec,    \
              in0, in1, in2, in3);                                             \
    DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp,               \
              select_vec, a2, temp, select_vec, a3, temp, select_vec,          \
              a0, a1, a2, a3);                                                 \
    DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1,           \
              in0, in1, in2, in3);                                             \
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,    \
              in0, in1, in2, in3);                                             \
126
/**
 * Column pass of the 8x8 simple IDCT, mirroring FUNC6(idctSparseCol) from
 * the scalar template (COL_SHIFT = 20, see the xvsrani_h_w by 20 below).
 *
 * Must run after LASX_IDCTROWCONDDC in the same scope: it reuses in0..in3,
 * the broadcast coefficient vectors w1..w7, the working registers
 * a0..a3/b0..b3/temp0..temp3, and the rounding bias const_val1
 * (16383 * ((1 << 19) / 16383), i.e. ~1 << (COL_SHIFT - 1) rounded to a
 * multiple of W4) set up by the row-pass macro.
 * Results are narrowed back to 16 bits into in0..in3.
 */
#define LASX_IDCTCOLS                                                          \
    /* part of FUNC6(idctSparseCol) */                                         \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3);                \
    temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4);                          \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2);             \
    a0    = __lasx_xvadd_w(temp0, temp1);                                      \
    a1    = __lasx_xvadd_w(temp0, temp2);                                      \
    a2    = __lasx_xvsub_w(temp0, temp2);                                      \
    a3    = __lasx_xvsub_w(temp0, temp1);                                      \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1);                \
    b0 = __lasx_xvdp2_w_h(temp0, temp1);                                       \
    temp1 = __lasx_xvneg_h(w7);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b1 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w5);                                        \
    b2 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
    temp1 = __lasx_xvneg_h(w5);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w7);                                        \
    b3 = __lasx_xvdp2_w_h(temp0, temp2);                                       \
                                                                               \
    /* if (AV_RN64A(row + 4)) */                                               \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1);                \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1);                                \
    temp1 = __lasx_xvilvl_h(w2, w4);                                           \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1);                                \
    temp1 = __lasx_xvneg_h(w4);                                                \
    temp2 = __lasx_xvilvl_h(w2, temp1);                                        \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w6);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w4);                                        \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2);                                \
                                                                               \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1);                \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1);                                \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2);                  \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1);                                \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2);                                \
    temp1 = __lasx_xvneg_h(w1);                                                \
    temp2 = __lasx_xvilvl_h(temp1, w3);                                        \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2);                                \
                                                                               \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3,                  \
              temp0, temp1, temp2, temp3);                                     \
    DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0,                  \
              a3, a2, a1, a0);                                                 \
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3,  \
              20, a0, a1, 20, in0, in1, in2, in3);                             \
176
/**
 * In-place 8x8 inverse DCT ("simple IDCT") of 16-bit coefficients, LASX
 * version: row pass then column pass, result written back to block.
 *
 * @param block  64 int16 coefficients, read and written as four 32-byte
 *               vectors. NOTE(review): presumably 32-byte aligned per the
 *               idctdsp contract — confirm with callers.
 */
void ff_simple_idct_lasx(int16_t *block)
{
    int32_t const_val = 1 << 10;  /* row-pass rounding bias: 1 << (ROW_SHIFT - 1), ROW_SHIFT = 11 */
    /* packed cosine table per 64-bit group: {0, W1=22725, W2=21407, W3=19265}
     * and {W4=16383, W5=12873, W6=8867, W7=4520}; split into w1..w7 by the
     * row-pass macro */
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC  /* row pass, >> 11, with DC-only shortcut */
    LASX_IDCTCOLS       /* column pass, >> 20, narrows back to int16 */
    /* undo the 128-bit-lane interleaving left by the column pass */
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    /* store the 8x8 result, two rows (32 bytes) per vector */
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}
199
/**
 * 8x8 simple IDCT of block, with the result clamped to [0, 255] and stored
 * (overwriting) into the 8x8 pixel area at dst.
 *
 * @param dst         destination pixel block, 8 bytes written per row
 * @param dst_stride  byte stride between destination rows
 * @param block       64 int16 coefficients (contents are consumed;
 *                    NOTE(review): block is NOT zeroed here — confirm the
 *                    caller clears it if required)
 */
void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;  /* row-pass rounding bias: 1 << (ROW_SHIFT - 1) */
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
    /* packed cosine table {0, W1, W2, W3} / {W4, W5, W6, W7}; see
     * LASX_IDCTROWCONDDC */
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC  /* row pass, >> 11 */
    LASX_IDCTCOLS       /* column pass, >> 20 */
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    /* clamp to unsigned 8-bit range, then pack 16-bit lanes down to bytes */
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    /* element order 0,2,1,3 compensates for xvpickev_b operating per
     * 128-bit lane */
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}
233
/**
 * 8x8 simple IDCT of block, with the result added to the existing 8x8
 * pixel area at dst and clamped to [0, 255].
 *
 * @param dst         destination pixel block, read-modify-written 8 bytes
 *                    per row
 * @param dst_stride  byte stride between destination rows
 * @param block       64 int16 coefficients (contents are consumed;
 *                    NOTE(review): block is NOT zeroed here — confirm the
 *                    caller clears it if required)
 */
void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;  /* row-pass rounding bias: 1 << (ROW_SHIFT - 1) */
    uint8_t *dst1 = dst;          /* read cursor over the existing pixels */
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;

    /* packed cosine table {0, W1, W2, W3} / {W4, W5, W6, W7}; see
     * LASX_IDCTROWCONDDC */
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    /* halfword shuffle indices used to merge two widened 8-pixel rows into
     * one 16-halfword vector */
    __m256i sh = {0x0003000200010000, 0x000B000A00090008,
                  0x0007000600050004, 0x000F000E000D000C};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC  /* row pass, >> 11 */
    LASX_IDCTCOLS       /* column pass, >> 20 */
    /* load the eight 8-byte destination rows and widen each to u16 */
    a0    = __lasx_xvldrepl_d(dst1, 0);
    a0    = __lasx_vext2xv_hu_bu(a0);
    dst1 += dst_stride;
    a1    = __lasx_xvldrepl_d(dst1, 0);
    a1    = __lasx_vext2xv_hu_bu(a1);
    dst1 += dst_stride;
    a2    = __lasx_xvldrepl_d(dst1, 0);
    a2    = __lasx_vext2xv_hu_bu(a2);
    dst1 += dst_stride;
    a3    = __lasx_xvldrepl_d(dst1, 0);
    a3    = __lasx_vext2xv_hu_bu(a3);
    dst1 += dst_stride;
    b0    = __lasx_xvldrepl_d(dst1, 0);
    b0    = __lasx_vext2xv_hu_bu(b0);
    dst1 += dst_stride;
    b1    = __lasx_xvldrepl_d(dst1, 0);
    b1    = __lasx_vext2xv_hu_bu(b1);
    dst1 += dst_stride;
    b2    = __lasx_xvldrepl_d(dst1, 0);
    b2    = __lasx_vext2xv_hu_bu(b2);
    dst1 += dst_stride;
    b3    = __lasx_xvldrepl_d(dst1, 0);
    b3    = __lasx_vext2xv_hu_bu(b3);
    /* merge row pairs into full vectors matching the IDCT output layout */
    DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
              temp0, temp1, temp2, temp3);
    /* add the IDCT residual to the existing pixels */
    DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    /* clamp to unsigned 8-bit range, then pack 16-bit lanes down to bytes */
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    /* element order 0,2,1,3 compensates for xvpickev_b operating per
     * 128-bit lane */
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}
298