/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Jin Bo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "vp9dsp_loongarch.h"
#include "libavutil/attributes.h"

#define VP9_DCT_CONST_BITS   14
#define ALLOC_ALIGNED(align) __attribute__ ((aligned(align)))
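/* ROUND_POWER_OF_TWO() below divides by 2^n with round-half-up, e.g.
 * ROUND_POWER_OF_TWO(37, 3) = (37 + 4) >> 3 = 5, i.e. 37/8 rounded.
 * The cospi_x_64/sinpi_x_9 tables that follow are the usual VP9 transform
 * constants in Q14 fixed point: cospi_k_64 ~= cos(k * pi / 64) * 2^14 and
 * sinpi_k_9 ~= sin(k * pi / 9) * 2 * sqrt(2) / 3 * 2^14. */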
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

const int32_t cospi_1_64 = 16364;
const int32_t cospi_2_64 = 16305;
const int32_t cospi_3_64 = 16207;
const int32_t cospi_4_64 = 16069;
const int32_t cospi_5_64 = 15893;
const int32_t cospi_6_64 = 15679;
const int32_t cospi_7_64 = 15426;
const int32_t cospi_8_64 = 15137;
const int32_t cospi_9_64 = 14811;
const int32_t cospi_10_64 = 14449;
const int32_t cospi_11_64 = 14053;
const int32_t cospi_12_64 = 13623;
const int32_t cospi_13_64 = 13160;
const int32_t cospi_14_64 = 12665;
const int32_t cospi_15_64 = 12140;
const int32_t cospi_16_64 = 11585;
const int32_t cospi_17_64 = 11003;
const int32_t cospi_18_64 = 10394;
const int32_t cospi_19_64 = 9760;
const int32_t cospi_20_64 = 9102;
const int32_t cospi_21_64 = 8423;
const int32_t cospi_22_64 = 7723;
const int32_t cospi_23_64 = 7005;
const int32_t cospi_24_64 = 6270;
const int32_t cospi_25_64 = 5520;
const int32_t cospi_26_64 = 4756;
const int32_t cospi_27_64 = 3981;
const int32_t cospi_28_64 = 3196;
const int32_t cospi_29_64 = 2404;
const int32_t cospi_30_64 = 1606;
const int32_t cospi_31_64 = 804;

const int32_t sinpi_1_9 = 5283;
const int32_t sinpi_2_9 = 9929;
const int32_t sinpi_3_9 = 13377;
const int32_t sinpi_4_9 = 15212;

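/* Butterfly rotation used throughout the IDCT (see the macro below):
 *   out0 = (reg0 * cnst0 - reg1 * cnst1 + rnd) >> VP9_DCT_CONST_BITS
 *   out1 = (reg0 * cnst1 + reg1 * cnst0 + rnd) >> VP9_DCT_CONST_BITS
 * applied lane-wise on 16-bit elements: the two constants are packed into
 * even/odd lanes and folded in with __lsx_vdp2_w_h + __lsx_vsrari_w. */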
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)  \
{                                                                  \
    __m128i k0_m = __lsx_vreplgr2vr_h(cnst0);                      \
    __m128i s0_m, s1_m, s2_m, s3_m;                                \
                                                                   \
    s0_m = __lsx_vreplgr2vr_h(cnst1);                              \
    k0_m = __lsx_vpackev_h(s0_m, k0_m);                            \
                                                                   \
    s1_m = __lsx_vilvl_h(__lsx_vneg_h(reg1), reg0);                \
    s0_m = __lsx_vilvh_h(__lsx_vneg_h(reg1), reg0);                \
    s3_m = __lsx_vilvl_h(reg0, reg1);                              \
    s2_m = __lsx_vilvh_h(reg0, reg1);                              \
    DUP2_ARG2(__lsx_vdp2_w_h, s1_m, k0_m, s0_m, k0_m, s1_m, s0_m); \
    DUP2_ARG2(__lsx_vsrari_w, s1_m, VP9_DCT_CONST_BITS,            \
              s0_m, VP9_DCT_CONST_BITS, s1_m, s0_m);               \
    out0 = __lsx_vpickev_h(s0_m, s1_m);                            \
    DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k0_m, s2_m, k0_m, s1_m, s0_m); \
    DUP2_ARG2(__lsx_vsrari_w, s1_m, VP9_DCT_CONST_BITS,            \
              s0_m, VP9_DCT_CONST_BITS, s1_m, s0_m);               \
    out1 = __lsx_vpickev_h(s0_m, s1_m);                            \
}

#define VP9_SET_COSPI_PAIR(c0_h, c1_h)    \
( {                                       \
    __m128i out0_m, r0_m, r1_m;           \
                                          \
    r0_m = __lsx_vreplgr2vr_h(c0_h);      \
    r1_m = __lsx_vreplgr2vr_h(c1_h);      \
    out0_m = __lsx_vpackev_h(r1_m, r0_m); \
                                          \
    out0_m;                               \
} )

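/* Add four rows of 16-bit residuals (in0..in3) to an 8x4 block of dst,
 * clip the sums to [0, 255] and store them back as bytes. */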
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)      \
{                                                                     \
    uint8_t *dst_m = (uint8_t *) (dst);                               \
    __m128i dst0_m, dst1_m, dst2_m, dst3_m;                           \
    __m128i tmp0_m, tmp1_m;                                           \
    __m128i res0_m, res1_m, res2_m, res3_m;                           \
    __m128i zero_m = __lsx_vldi(0);                                   \
    DUP4_ARG2(__lsx_vld, dst_m, 0, dst_m + dst_stride, 0,             \
              dst_m + 2 * dst_stride, 0, dst_m + 3 * dst_stride, 0,   \
              dst0_m, dst1_m, dst2_m, dst3_m);                        \
    DUP4_ARG2(__lsx_vilvl_b, zero_m, dst0_m, zero_m, dst1_m, zero_m,  \
              dst2_m, zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);\
    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2,    \
              res3_m, in3, res0_m, res1_m, res2_m, res3_m);           \
    DUP4_ARG1(__lsx_vclip255_h, res0_m, res1_m, res2_m, res3_m,       \
              res0_m, res1_m, res2_m, res3_m);                        \
    DUP2_ARG2(__lsx_vpickev_b, res1_m, res0_m, res3_m, res2_m,        \
              tmp0_m, tmp1_m);                                        \
    __lsx_vstelm_d(tmp0_m, dst_m, 0, 0);                              \
    __lsx_vstelm_d(tmp0_m, dst_m + dst_stride, 0, 1);                 \
    __lsx_vstelm_d(tmp1_m, dst_m + 2 * dst_stride, 0, 0);             \
    __lsx_vstelm_d(tmp1_m, dst_m + 3 * dst_stride, 0, 1);             \
}

#define VP9_UNPCK_UB_SH(in, out_h, out_l) \
{                                         \
    __m128i zero = __lsx_vldi(0);         \
    out_l = __lsx_vilvl_b(zero, in);      \
    out_h = __lsx_vilvh_b(zero, in);      \
}

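/* Transpose helper for the partial 8x8 IDCT: treats the low four 16-bit
 * elements of in0..in7 as an 8x4 tile, transposes it into out0..out3 and
 * clears out4..out7. */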
#define VP9_ILVLTRANS4x8_H(in0, in1, in2, in3, in4, in5, in6, in7,          \
                           out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                           \
    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
    __m128i tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                 \
    __m128i zero_m = __lsx_vldi(0);                                         \
                                                                            \
    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,        \
              tmp0_n, tmp1_n, tmp2_n, tmp3_n);                              \
    tmp0_m = __lsx_vilvl_w(tmp1_n, tmp0_n);                                 \
    tmp2_m = __lsx_vilvh_w(tmp1_n, tmp0_n);                                 \
    tmp1_m = __lsx_vilvl_w(tmp3_n, tmp2_n);                                 \
    tmp3_m = __lsx_vilvh_w(tmp3_n, tmp2_n);                                 \
                                                                            \
    out0 = __lsx_vilvl_d(tmp1_m, tmp0_m);                                   \
    out1 = __lsx_vilvh_d(tmp1_m, tmp0_m);                                   \
    out2 = __lsx_vilvl_d(tmp3_m, tmp2_m);                                   \
    out3 = __lsx_vilvh_d(tmp3_m, tmp2_m);                                   \
                                                                            \
    out4 = zero_m;                                                          \
    out5 = zero_m;                                                          \
    out6 = zero_m;                                                          \
    out7 = zero_m;                                                          \
}

/* multiply and add macro */
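/* VP9_MADD applies four dot-product rotations with packed constant pairs:
 *   out0 = (inp0 * c0a + inp1 * c0b + rnd) >> 14   (cst0 = {c0a, c0b})
 *   out1 = (inp0 * c1a + inp1 * c1b + rnd) >> 14   (cst1 = {c1a, c1b})
 * and likewise out2/out3 from (inp2, inp3) with cst2/cst3. */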
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,            \
                 out0, out1, out2, out3)                                    \
{                                                                           \
    __m128i madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                            \
    madd_s1_m = __lsx_vilvl_h(inp1, inp0);                                  \
    madd_s0_m = __lsx_vilvh_h(inp1, inp0);                                  \
    madd_s3_m = __lsx_vilvl_h(inp3, inp2);                                  \
    madd_s2_m = __lsx_vilvh_h(inp3, inp2);                                  \
    DUP4_ARG2(__lsx_vdp2_w_h, madd_s1_m, cst0, madd_s0_m, cst0,             \
              madd_s1_m, cst1, madd_s0_m, cst1, tmp0_m, tmp1_m,             \
              tmp2_m, tmp3_m);                                              \
    DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS, tmp1_m,           \
              VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS, tmp3_m,       \
              VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m);          \
    DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
    DUP4_ARG2(__lsx_vdp2_w_h, madd_s3_m, cst2, madd_s2_m, cst2, madd_s3_m,  \
              cst3, madd_s2_m, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);       \
    DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS,                   \
              tmp1_m, VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS,       \
              tmp3_m, VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
    DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
}

#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)                           \
( {                                                                          \
    __m128i c0_m, c1_m;                                                      \
                                                                             \
    DUP2_ARG2(__lsx_vreplvei_h, mask_h, idx1_h, mask_h, idx2_h, c0_m, c1_m); \
    c0_m = __lsx_vpackev_h(c1_m, c0_m);                                      \
                                                                             \
    c0_m;                                                                    \
} )

/* idct 8x8 macro */
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,                 \
                       out0, out1, out2, out3, out4, out5, out6, out7)         \
{                                                                              \
    __m128i tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;            \
    __m128i k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;            \
    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
          cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };              \
                                                                               \
    k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
    k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
    k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
    k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
    VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
    DUP2_ARG2(__lsx_vsub_h, in1, in3, in7, in5, res0_m, res1_m);               \
    k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
    k1_m = __lsx_vreplvei_h(mask_m, 4);                                        \
                                                                               \
    res2_m = __lsx_vilvl_h(res0_m, res1_m);                                    \
    res3_m = __lsx_vilvh_h(res0_m, res1_m);                                    \
    DUP4_ARG2(__lsx_vdp2_w_h, res2_m, k0_m, res3_m, k0_m, res2_m, k1_m,        \
              res3_m, k1_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    DUP4_ARG2(__lsx_vsrari_w, tmp0_m, VP9_DCT_CONST_BITS,                      \
              tmp1_m, VP9_DCT_CONST_BITS, tmp2_m, VP9_DCT_CONST_BITS,          \
              tmp3_m, VP9_DCT_CONST_BITS, tmp0_m, tmp1_m, tmp2_m, tmp3_m);     \
    tp4_m = __lsx_vadd_h(in1, in3);                                            \
    DUP2_ARG2(__lsx_vpickev_h, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);  \
    tp7_m = __lsx_vadd_h(in7, in5);                                            \
    k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
    k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
    VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
             in0, in4, in2, in6);                                              \
    LSX_BUTTERFLY_4_H(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);         \
    LSX_BUTTERFLY_8_H(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,  \
                  out0, out1, out2, out3, out4, out5, out6, out7);             \
}

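/* DC-only 8x8 IDCT: input[0] is the only non-zero coefficient, so the whole
 * block reduces to a single value which is rounded, shifted by 5 and added
 * to all 8x8 destination pixels. */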
static av_always_inline
void vp9_idct8x8_1_add_lsx(int16_t *input, uint8_t *dst,
                           int32_t dst_stride)
{
    int16_t out;
    int32_t val;
    __m128i vec;

    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
    val = ROUND_POWER_OF_TWO(out, 5);
    vec = __lsx_vreplgr2vr_h(val);
    input[0] = 0;

    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
    dst += (4 * dst_stride);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
}

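/* Partial 8x8 IDCT for sparse blocks: only the first four coefficients of
 * each input row are used by the row transform (the rows are packed two per
 * vector), then a full column transform produces the final residual. */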
static void vp9_idct8x8_12_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
                                             int32_t dst_stride)
{
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i zero = __lsx_vldi(0);

    /* load vector elements of 8x8 block */
    DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
              in4, in5, in6, in7);
    __lsx_vst(zero, input, 0);
    __lsx_vst(zero, input, 16);
    __lsx_vst(zero, input, 32);
    __lsx_vst(zero, input, 48);
    __lsx_vst(zero, input, 64);
    __lsx_vst(zero, input, 80);
    __lsx_vst(zero, input, 96);
    __lsx_vst(zero, input, 112);
    DUP4_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, in5, in4, in7,
              in6, in0, in1, in2, in3);

    /* stage1 */
    DUP2_ARG2(__lsx_vilvh_h, in3, in0, in2, in1, s0, s1);
    k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
    k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
    k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
    k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
    DUP4_ARG2(__lsx_vdp2_w_h, s0, k0, s0, k1, s1, k2, s1, k3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1,
              VP9_DCT_CONST_BITS, tmp2, VP9_DCT_CONST_BITS, tmp3,
              VP9_DCT_CONST_BITS, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, zero, tmp2, zero, tmp3,
              s0, s1, s2, s3);
    LSX_BUTTERFLY_4_H(s0, s1, s3, s2, s4, s7, s6, s5);

    /* stage2 */
    DUP2_ARG2(__lsx_vilvl_h, in3, in1, in2, in0, s1, s0);
    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
    k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
    k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
    k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
    DUP4_ARG2(__lsx_vdp2_w_h, s0, k0, s0, k1, s1, k2, s1, k3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1,
              VP9_DCT_CONST_BITS, tmp2, VP9_DCT_CONST_BITS, tmp3,
              VP9_DCT_CONST_BITS, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, zero, tmp2, zero, tmp3,
              s0, s1, s2, s3);
    LSX_BUTTERFLY_4_H(s0, s1, s2, s3, m0, m1, m2, m3);

    /* stage3 */
    s0 = __lsx_vilvl_h(s6, s5);

    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
    DUP2_ARG2(__lsx_vdp2_w_h, s0, k1, s0, k0, tmp0, tmp1);
    DUP2_ARG2(__lsx_vsrari_w, tmp0, VP9_DCT_CONST_BITS, tmp1,
              VP9_DCT_CONST_BITS, tmp0, tmp1);
    DUP2_ARG2(__lsx_vpickev_h, zero, tmp0, zero, tmp1, s2, s3);

    /* stage4 */
    LSX_BUTTERFLY_8_H(m0, m1, m2, m3, s4, s2, s3, s7,
                      in0, in1, in2, in3, in4, in5, in6, in7);
    VP9_ILVLTRANS4x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                   in0, in1, in2, in3, in4, in5, in6, in7);

    /* final rounding (add 2^4, divide by 2^5) and shift */
    DUP4_ARG2(__lsx_vsrari_h, in0, 5, in1, 5, in2, 5, in3, 5,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vsrari_h, in4, 5, in5, 5, in6, 5, in7, 5,
              in4, in5, in6, in7);

    /* add block and store 8x8 */
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
    dst += (4 * dst_stride);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
}

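/* Full 8x8 inverse DCT: one 1D pass over the rows, a transpose, a second
 * 1D pass over the columns, then round by 5 bits and add to dst. */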
static void vp9_idct8x8_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
                                          int32_t dst_stride)
{
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i zero = __lsx_vldi(0);

    /* load vector elements of 8x8 block */
    DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
              in4, in5, in6, in7);
    __lsx_vst(zero, input, 0);
    __lsx_vst(zero, input, 16);
    __lsx_vst(zero, input, 32);
    __lsx_vst(zero, input, 48);
    __lsx_vst(zero, input, 64);
    __lsx_vst(zero, input, 80);
    __lsx_vst(zero, input, 96);
    __lsx_vst(zero, input, 112);
    /* 1D idct8x8 */
    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                   in0, in1, in2, in3, in4, in5, in6, in7);
    /* columns transform */
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    /* 1D idct8x8 */
    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                   in0, in1, in2, in3, in4, in5, in6, in7);
    /* final rounding (add 2^4, divide by 2^5) and shift */
    DUP4_ARG2(__lsx_vsrari_h, in0, 5, in1, 5, in2, 5, in3, 5,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vsrari_h, in4, 5, in5, 5, in6, 5, in7, 5,
              in4, in5, in6, in7);
    /* add block and store 8x8 */
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
    dst += (4 * dst_stride);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
}

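/* One 1D 16-point IDCT pass over 8 columns of coefficients; the rounded
 * result (>> 6) is added directly to the destination block. */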
static void vp9_idct16_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
                                             int32_t dst_stride)
{
    __m128i loc0, loc1, loc2, loc3;
    __m128i reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
    __m128i reg1, reg3, reg5, reg7, reg9, reg11, reg13, reg15;
    __m128i tmp5, tmp6, tmp7;
    __m128i zero = __lsx_vldi(0);
    int32_t offset = dst_stride << 2;

    DUP4_ARG2(__lsx_vld, input, 32*0, input, 32*1, input, 32*2, input, 32*3,
              reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vld, input, 32*4, input, 32*5, input, 32*6, input, 32*7,
              reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vld, input, 32*8, input, 32*9, input, 32*10, input, 32*11,
              reg8, reg9, reg10, reg11);
    DUP4_ARG2(__lsx_vld, input, 32*12, input, 32*13, input, 32*14, input,
              32*15, reg12, reg13, reg14, reg15);

    __lsx_vst(zero, input, 32*0);
    __lsx_vst(zero, input, 32*1);
    __lsx_vst(zero, input, 32*2);
    __lsx_vst(zero, input, 32*3);
    __lsx_vst(zero, input, 32*4);
    __lsx_vst(zero, input, 32*5);
    __lsx_vst(zero, input, 32*6);
    __lsx_vst(zero, input, 32*7);
    __lsx_vst(zero, input, 32*8);
    __lsx_vst(zero, input, 32*9);
    __lsx_vst(zero, input, 32*10);
    __lsx_vst(zero, input, 32*11);
    __lsx_vst(zero, input, 32*12);
    __lsx_vst(zero, input, 32*13);
    __lsx_vst(zero, input, 32*14);
    __lsx_vst(zero, input, 32*15);

    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
    LSX_BUTTERFLY_4_H(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
    LSX_BUTTERFLY_4_H(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);

    reg0 = __lsx_vsub_h(reg2, loc1);
    reg2 = __lsx_vadd_h(reg2, loc1);
    reg12 = __lsx_vsub_h(reg14, loc0);
    reg14 = __lsx_vadd_h(reg14, loc0);
    reg4 = __lsx_vsub_h(reg6, loc3);
    reg6 = __lsx_vadd_h(reg6, loc3);
    reg8 = __lsx_vsub_h(reg10, loc2);
    reg10 = __lsx_vadd_h(reg10, loc2);

    /* stage2 */
    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);

    reg9 = __lsx_vsub_h(reg1, loc2);
    reg1 = __lsx_vadd_h(reg1, loc2);
    reg7 = __lsx_vsub_h(reg15, loc3);
    reg15 = __lsx_vadd_h(reg15, loc3);

    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
    LSX_BUTTERFLY_4_H(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);

    loc1 = __lsx_vadd_h(reg15, reg3);
    reg3 = __lsx_vsub_h(reg15, reg3);
    loc2 = __lsx_vadd_h(reg2, loc1);
    reg15 = __lsx_vsub_h(reg2, loc1);

    loc1 = __lsx_vadd_h(reg1, reg13);
    reg13 = __lsx_vsub_h(reg1, reg13);
    loc0 = __lsx_vadd_h(reg0, loc1);
    loc1 = __lsx_vsub_h(reg0, loc1);
    tmp6 = loc0;
    tmp7 = loc1;
    reg0 = loc2;

    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
    VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg5), __lsx_vneg_h(reg11), cospi_8_64,
                        cospi_24_64, reg5, reg11);

    loc0 = __lsx_vadd_h(reg9, reg5);
    reg5 = __lsx_vsub_h(reg9, reg5);
    reg2 = __lsx_vadd_h(reg6, loc0);
    reg1 = __lsx_vsub_h(reg6, loc0);

    loc0 = __lsx_vadd_h(reg7, reg11);
    reg11 = __lsx_vsub_h(reg7, reg11);
    loc1 = __lsx_vadd_h(reg4, loc0);
    loc2 = __lsx_vsub_h(reg4, loc0);
    tmp5 = loc1;

    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
    LSX_BUTTERFLY_4_H(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);

    reg10 = loc0;
    reg11 = loc1;

    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
    LSX_BUTTERFLY_4_H(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
    reg13 = loc2;

    /* Restore the saved stage results, round and add them to dst */
    reg12 = tmp5;
    reg14 = tmp6;
    reg3 = tmp7;

    DUP4_ARG2(__lsx_vsrari_h, reg0, 6, reg2, 6, reg4, 6, reg6, 6,
              reg0, reg2, reg4, reg6);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
    dst += offset;
    DUP4_ARG2(__lsx_vsrari_h, reg8, 6, reg10, 6, reg12, 6, reg14, 6,
              reg8, reg10, reg12, reg14);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
    dst += offset;
    DUP4_ARG2(__lsx_vsrari_h, reg3, 6, reg5, 6, reg11, 6, reg13, 6,
              reg3, reg5, reg11, reg13);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
    dst += offset;
    DUP4_ARG2(__lsx_vsrari_h, reg1, 6, reg7, 6, reg9, 6, reg15, 6,
              reg1, reg7, reg9, reg15);
    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
}

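/* Same 16-point column pass as above, but instead of adding to the picture
 * it transposes the result and stores it into an intermediate buffer, so it
 * also serves as the row pass of the 16x16 transforms. */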
static void vp9_idct16_1d_columns_lsx(int16_t *input, int16_t *output)
{
    __m128i loc0, loc1, loc2, loc3;
    __m128i reg1, reg3, reg5, reg7, reg9, reg11, reg13, reg15;
    __m128i reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
    __m128i tmp5, tmp6, tmp7;
    __m128i zero = __lsx_vldi(0);
    int16_t *offset;

    DUP4_ARG2(__lsx_vld, input, 32*0, input, 32*1, input, 32*2, input, 32*3,
              reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vld, input, 32*4, input, 32*5, input, 32*6, input, 32*7,
              reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vld, input, 32*8, input, 32*9, input, 32*10, input, 32*11,
              reg8, reg9, reg10, reg11);
    DUP4_ARG2(__lsx_vld, input, 32*12, input, 32*13, input, 32*14, input,
              32*15, reg12, reg13, reg14, reg15);

    __lsx_vst(zero, input, 32*0);
    __lsx_vst(zero, input, 32*1);
    __lsx_vst(zero, input, 32*2);
    __lsx_vst(zero, input, 32*3);
    __lsx_vst(zero, input, 32*4);
    __lsx_vst(zero, input, 32*5);
    __lsx_vst(zero, input, 32*6);
    __lsx_vst(zero, input, 32*7);
    __lsx_vst(zero, input, 32*8);
    __lsx_vst(zero, input, 32*9);
    __lsx_vst(zero, input, 32*10);
    __lsx_vst(zero, input, 32*11);
    __lsx_vst(zero, input, 32*12);
    __lsx_vst(zero, input, 32*13);
    __lsx_vst(zero, input, 32*14);
    __lsx_vst(zero, input, 32*15);

    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
    LSX_BUTTERFLY_4_H(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
    LSX_BUTTERFLY_4_H(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);

    reg0 = __lsx_vsub_h(reg2, loc1);
    reg2 = __lsx_vadd_h(reg2, loc1);
    reg12 = __lsx_vsub_h(reg14, loc0);
    reg14 = __lsx_vadd_h(reg14, loc0);
    reg4 = __lsx_vsub_h(reg6, loc3);
    reg6 = __lsx_vadd_h(reg6, loc3);
    reg8 = __lsx_vsub_h(reg10, loc2);
    reg10 = __lsx_vadd_h(reg10, loc2);

    /* stage2 */
    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);

    reg9 = __lsx_vsub_h(reg1, loc2);
    reg1 = __lsx_vadd_h(reg1, loc2);
    reg7 = __lsx_vsub_h(reg15, loc3);
    reg15 = __lsx_vadd_h(reg15, loc3);

    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
    LSX_BUTTERFLY_4_H(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);

    loc1 = __lsx_vadd_h(reg15, reg3);
    reg3 = __lsx_vsub_h(reg15, reg3);
    loc2 = __lsx_vadd_h(reg2, loc1);
    reg15 = __lsx_vsub_h(reg2, loc1);

    loc1 = __lsx_vadd_h(reg1, reg13);
    reg13 = __lsx_vsub_h(reg1, reg13);
    loc0 = __lsx_vadd_h(reg0, loc1);
    loc1 = __lsx_vsub_h(reg0, loc1);
    tmp6 = loc0;
    tmp7 = loc1;
    reg0 = loc2;

    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
    VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg5), __lsx_vneg_h(reg11), cospi_8_64,
                        cospi_24_64, reg5, reg11);

    loc0 = __lsx_vadd_h(reg9, reg5);
    reg5 = __lsx_vsub_h(reg9, reg5);
    reg2 = __lsx_vadd_h(reg6, loc0);
    reg1 = __lsx_vsub_h(reg6, loc0);

    loc0 = __lsx_vadd_h(reg7, reg11);
    reg11 = __lsx_vsub_h(reg7, reg11);
    loc1 = __lsx_vadd_h(reg4, loc0);
    loc2 = __lsx_vsub_h(reg4, loc0);

    tmp5 = loc1;

    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
    LSX_BUTTERFLY_4_H(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);

    reg10 = loc0;
    reg11 = loc1;

    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
    LSX_BUTTERFLY_4_H(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
    reg13 = loc2;

    /* Transpose and store the output */
    reg12 = tmp5;
    reg14 = tmp6;
    reg3 = tmp7;

    /* transpose block */
    LSX_TRANSPOSE8x8_H(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
                       reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);

    __lsx_vst(reg0, output, 32*0);
    __lsx_vst(reg2, output, 32*1);
    __lsx_vst(reg4, output, 32*2);
    __lsx_vst(reg6, output, 32*3);
    __lsx_vst(reg8, output, 32*4);
    __lsx_vst(reg10, output, 32*5);
    __lsx_vst(reg12, output, 32*6);
    __lsx_vst(reg14, output, 32*7);

    /* transpose block */
    LSX_TRANSPOSE8x8_H(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
                       reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);

    offset = output + 8;
    __lsx_vst(reg3, offset, 32*0);
    __lsx_vst(reg13, offset, 32*1);
    __lsx_vst(reg11, offset, 32*2);
    __lsx_vst(reg5, offset, 32*3);

    offset = output + 8 + 4 * 16;
    __lsx_vst(reg7, offset, 32*0);
    __lsx_vst(reg9, offset, 32*1);
    __lsx_vst(reg1, offset, 32*2);
    __lsx_vst(reg15, offset, 32*3);
}

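/* DC-only 16x16 IDCT: replicate the single transformed DC value and add it
 * to the whole 16x16 destination block, four rows per loop iteration. */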
static void vp9_idct16x16_1_add_lsx(int16_t *input, uint8_t *dst,
                                    int32_t dst_stride)
{
    uint8_t i;
    int16_t out;
    __m128i vec, res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
    int32_t stride2 = dst_stride << 1;
    int32_t stride3 = stride2 + dst_stride;
    int32_t stride4 = stride2 << 1;

    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
    out = ROUND_POWER_OF_TWO(out, 6);
    input[0] = 0;
    vec = __lsx_vreplgr2vr_h(out);

    for (i = 4; i--;) {
        dst0 = __lsx_vld(dst, 0);
        DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, stride2, dst1, dst2);
        dst3 = __lsx_vldx(dst, stride3);
        VP9_UNPCK_UB_SH(dst0, res4, res0);
        VP9_UNPCK_UB_SH(dst1, res5, res1);
        VP9_UNPCK_UB_SH(dst2, res6, res2);
        VP9_UNPCK_UB_SH(dst3, res7, res3);
        DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec,
                  res0, res1, res2, res3);
        DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec,
                  res4, res5, res6, res7);
        DUP4_ARG1(__lsx_vclip255_h, res0, res1, res2, res3,
                  res0, res1, res2, res3);
        DUP4_ARG1(__lsx_vclip255_h, res4, res5, res6, res7,
                  res4, res5, res6, res7);
        DUP4_ARG2(__lsx_vpickev_b, res4, res0, res5, res1, res6,
                  res2, res7, res3, tmp0, tmp1, tmp2, tmp3);
        __lsx_vst(tmp0, dst, 0);
        __lsx_vstx(tmp1, dst, dst_stride);
        __lsx_vstx(tmp2, dst, stride2);
        __lsx_vstx(tmp3, dst, stride3);
        dst += stride4;
    }
}

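/* Partial 16x16 IDCT for sparse blocks: a single 8-column row pass is run
 * and only its top four output rows are kept; the rest of the intermediate
 * buffer is zeroed before the two column passes add the result to dst. */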
static void vp9_idct16x16_10_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
                                               int32_t dst_stride)
{
    int32_t i;
    int16_t out_arr[16 * 16] ALLOC_ALIGNED(16);
    int16_t *out = out_arr;
    __m128i zero = __lsx_vldi(0);

    /* transform rows */
    vp9_idct16_1d_columns_lsx(input, out);

    /* short case just considers top 4 rows as valid output */
    out += 4 * 16;
    for (i = 3; i--;) {
        __lsx_vst(zero, out, 0);
        __lsx_vst(zero, out, 16);
        __lsx_vst(zero, out, 32);
        __lsx_vst(zero, out, 48);
        __lsx_vst(zero, out, 64);
        __lsx_vst(zero, out, 80);
        __lsx_vst(zero, out, 96);
        __lsx_vst(zero, out, 112);
        out += 64;
    }

    out = out_arr;

    /* transform columns */
    for (i = 0; i < 2; i++) {
        /* process 8 * 16 block */
        vp9_idct16_1d_columns_addblk_lsx((out + (i << 3)), (dst + (i << 3)),
                                         dst_stride);
    }
}

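/* Full 16x16 IDCT: two 8-column row passes into a temporary buffer,
 * followed by two 8-column column passes that add the result to dst. */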
static void vp9_idct16x16_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
                                            int32_t dst_stride)
{
    int32_t i;
    int16_t out_arr[16 * 16] ALLOC_ALIGNED(16);
    int16_t *out = out_arr;

    /* transform rows */
    for (i = 0; i < 2; i++) {
        /* process 8 * 16 block */
        vp9_idct16_1d_columns_lsx((input + (i << 3)), (out + (i << 7)));
    }

    /* transform columns */
    for (i = 0; i < 2; i++) {
        /* process 8 * 16 block */
        vp9_idct16_1d_columns_addblk_lsx((out + (i << 3)), (dst + (i << 3)),
                                         dst_stride);
    }
}

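/* Final butterfly of a 32-point pass: combine the even and odd partial
 * results, keep the upper half in registers, spill the lower half to
 * tmp_buf, then transpose everything into 32-element-wide rows of dst
 * (used for the row pass of the 32x32 transform). */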
static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
                                               int16_t *tmp_eve_buf,
                                               int16_t *tmp_odd_buf,
                                               int16_t *dst)
{
    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
    __m128i m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

    /* FINAL BUTTERFLY : Dependency on Even & Odd */
    vec0 = __lsx_vld(tmp_odd_buf, 0);
    vec1 = __lsx_vld(tmp_odd_buf, 9 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 14 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 6 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 0);
    loc1 = __lsx_vld(tmp_eve_buf, 8 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 4 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 12 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              m0, m4, m2, m6);

    #define SUB(a, b) __lsx_vsub_h(a, b)

    __lsx_vst(SUB(loc0, vec3), tmp_buf, 31 * 16);
    __lsx_vst(SUB(loc1, vec2), tmp_buf, 23 * 16);
    __lsx_vst(SUB(loc2, vec1), tmp_buf, 27 * 16);
    __lsx_vst(SUB(loc3, vec0), tmp_buf, 19 * 16);

    /* Load 8 & Store 8 */
    vec0 = __lsx_vld(tmp_odd_buf, 4 * 16);
    vec1 = __lsx_vld(tmp_odd_buf, 13 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 10 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 3 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 2 * 16);
    loc1 = __lsx_vld(tmp_eve_buf, 10 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 6 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 14 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              m1, m5, m3, m7);

    __lsx_vst(SUB(loc0, vec3), tmp_buf, 29 * 16);
    __lsx_vst(SUB(loc1, vec2), tmp_buf, 21 * 16);
    __lsx_vst(SUB(loc2, vec1), tmp_buf, 25 * 16);
    __lsx_vst(SUB(loc3, vec0), tmp_buf, 17 * 16);

    /* Load 8 & Store 8 */
    vec0 = __lsx_vld(tmp_odd_buf, 2 * 16);
    vec1 = __lsx_vld(tmp_odd_buf, 11 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 12 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 7 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 1 * 16);
    loc1 = __lsx_vld(tmp_eve_buf, 9 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 5 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 13 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              n0, n4, n2, n6);

    __lsx_vst(SUB(loc0, vec3), tmp_buf, 30 * 16);
    __lsx_vst(SUB(loc1, vec2), tmp_buf, 22 * 16);
    __lsx_vst(SUB(loc2, vec1), tmp_buf, 26 * 16);
    __lsx_vst(SUB(loc3, vec0), tmp_buf, 18 * 16);

    /* Load 8 & Store 8 */
    vec0 = __lsx_vld(tmp_odd_buf, 5 * 16);
    vec1 = __lsx_vld(tmp_odd_buf, 15 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 8 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 1 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 3 * 16);
    loc1 = __lsx_vld(tmp_eve_buf, 11 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 7 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 15 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              n1, n5, n3, n7);

    __lsx_vst(SUB(loc0, vec3), tmp_buf, 28 * 16);
    __lsx_vst(SUB(loc1, vec2), tmp_buf, 20 * 16);
    __lsx_vst(SUB(loc2, vec1), tmp_buf, 24 * 16);
    __lsx_vst(SUB(loc3, vec0), tmp_buf, 16 * 16);

    /* Transpose : 16 vectors */
    /* 1st & 2nd 8x8 */
    LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3,
                       m0, n0, m1, n1, m2, n2, m3, n3);
    __lsx_vst(m0, dst, 0);
    __lsx_vst(n0, dst, 32 * 2);
    __lsx_vst(m1, dst, 32 * 4);
    __lsx_vst(n1, dst, 32 * 6);
    __lsx_vst(m2, dst, 32 * 8);
    __lsx_vst(n2, dst, 32 * 10);
    __lsx_vst(m3, dst, 32 * 12);
    __lsx_vst(n3, dst, 32 * 14);

    LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7,
                       m4, n4, m5, n5, m6, n6, m7, n7);

    __lsx_vst(m4, dst, 16);
    __lsx_vst(n4, dst, 16 + 32 * 2);
    __lsx_vst(m5, dst, 16 + 32 * 4);
    __lsx_vst(n5, dst, 16 + 32 * 6);
    __lsx_vst(m6, dst, 16 + 32 * 8);
    __lsx_vst(n6, dst, 16 + 32 * 10);
    __lsx_vst(m7, dst, 16 + 32 * 12);
    __lsx_vst(n7, dst, 16 + 32 * 14);

    /* 3rd & 4th 8x8 */
    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 16, tmp_buf, 16 * 17,
              tmp_buf, 16 * 18, tmp_buf, 16 * 19, m0, n0, m1, n1);
    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 20, tmp_buf, 16 * 21,
              tmp_buf, 16 * 22, tmp_buf, 16 * 23, m2, n2, m3, n3);

    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 24, tmp_buf, 16 * 25,
              tmp_buf, 16 * 26, tmp_buf, 16 * 27, m4, n4, m5, n5);
    DUP4_ARG2(__lsx_vld, tmp_buf, 16 * 28, tmp_buf, 16 * 29,
              tmp_buf, 16 * 30, tmp_buf, 16 * 31, m6, n6, m7, n7);

    LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3,
                       m0, n0, m1, n1, m2, n2, m3, n3);

    __lsx_vst(m0, dst, 32);
    __lsx_vst(n0, dst, 32 + 32 * 2);
    __lsx_vst(m1, dst, 32 + 32 * 4);
    __lsx_vst(n1, dst, 32 + 32 * 6);
    __lsx_vst(m2, dst, 32 + 32 * 8);
    __lsx_vst(n2, dst, 32 + 32 * 10);
    __lsx_vst(m3, dst, 32 + 32 * 12);
    __lsx_vst(n3, dst, 32 + 32 * 14);

    LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7,
                       m4, n4, m5, n5, m6, n6, m7, n7);

    __lsx_vst(m4, dst, 48);
    __lsx_vst(n4, dst, 48 + 32 * 2);
    __lsx_vst(m5, dst, 48 + 32 * 4);
    __lsx_vst(n5, dst, 48 + 32 * 6);
    __lsx_vst(m6, dst, 48 + 32 * 8);
    __lsx_vst(n6, dst, 48 + 32 * 10);
    __lsx_vst(m7, dst, 48 + 32 * 12);
    __lsx_vst(n7, dst, 48 + 32 * 14);
}

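/* Even half of the 32-point column transform for one 8-column slice:
 * consumes the even-indexed input rows (zeroing them in the source buffer)
 * and stores the 16 even partial results to tmp_eve_buf. */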
static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
                                                   int16_t *tmp_eve_buf)
{
    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
    __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
    __m128i zero = __lsx_vldi(0);

    /* Even stage 1 */
    DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 32 * 8,
              tmp_buf, 32 * 16, tmp_buf, 32 * 24, reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vld, tmp_buf, 32 * 32, tmp_buf, 32 * 40,
              tmp_buf, 32 * 48, tmp_buf, 32 * 56, reg4, reg5, reg6, reg7);

    __lsx_vst(zero, tmp_buf, 0);
    __lsx_vst(zero, tmp_buf, 32 * 8);
    __lsx_vst(zero, tmp_buf, 32 * 16);
    __lsx_vst(zero, tmp_buf, 32 * 24);
    __lsx_vst(zero, tmp_buf, 32 * 32);
    __lsx_vst(zero, tmp_buf, 32 * 40);
    __lsx_vst(zero, tmp_buf, 32 * 48);
    __lsx_vst(zero, tmp_buf, 32 * 56);

    tmp_buf += (2 * 32);

    VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
    VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
    LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
    VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);

    loc1 = vec3;
    loc0 = vec1;

    VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
    VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
    LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
    LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
    LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);

    /* Even stage 2 */
    /* Load 8 */
    DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 32 * 8,
              tmp_buf, 32 * 16, tmp_buf, 32 * 24, reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vld, tmp_buf, 32 * 32, tmp_buf, 32 * 40,
              tmp_buf, 32 * 48, tmp_buf, 32 * 56, reg4, reg5, reg6, reg7);

    __lsx_vst(zero, tmp_buf, 0);
    __lsx_vst(zero, tmp_buf, 32 * 8);
    __lsx_vst(zero, tmp_buf, 32 * 16);
    __lsx_vst(zero, tmp_buf, 32 * 24);
    __lsx_vst(zero, tmp_buf, 32 * 32);
    __lsx_vst(zero, tmp_buf, 32 * 40);
    __lsx_vst(zero, tmp_buf, 32 * 48);
    __lsx_vst(zero, tmp_buf, 32 * 56);

    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);

    vec0 = __lsx_vadd_h(reg0, reg4);
    reg0 = __lsx_vsub_h(reg0, reg4);
    reg4 = __lsx_vadd_h(reg6, reg2);
    reg6 = __lsx_vsub_h(reg6, reg2);
    reg2 = __lsx_vadd_h(reg1, reg5);
    reg1 = __lsx_vsub_h(reg1, reg5);
    reg5 = __lsx_vadd_h(reg7, reg3);
    reg7 = __lsx_vsub_h(reg7, reg3);
    reg3 = vec0;

    vec1 = reg2;
    reg2 = __lsx_vadd_h(reg3, reg4);
    reg3 = __lsx_vsub_h(reg3, reg4);
    reg4 = __lsx_vsub_h(reg5, vec1);
    reg5 = __lsx_vadd_h(reg5, vec1);

    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
    VP9_DOTP_CONST_PAIR(__lsx_vneg_h(reg6), reg1, cospi_24_64, cospi_8_64,
                        reg6, reg1);

    vec0 = __lsx_vsub_h(reg0, reg6);
    reg0 = __lsx_vadd_h(reg0, reg6);
    vec1 = __lsx_vsub_h(reg7, reg1);
    reg7 = __lsx_vadd_h(reg7, reg1);

    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);

    /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
    /* Store 8 */
    LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
    __lsx_vst(loc1, tmp_eve_buf, 0);
    __lsx_vst(loc3, tmp_eve_buf, 16);
    __lsx_vst(loc2, tmp_eve_buf, 14 * 16);
    __lsx_vst(loc0, tmp_eve_buf, 14 * 16 + 16);
    LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
    __lsx_vst(loc1, tmp_eve_buf, 2 * 16);
    __lsx_vst(loc3, tmp_eve_buf, 2 * 16 + 16);
    __lsx_vst(loc2, tmp_eve_buf, 12 * 16);
    __lsx_vst(loc0, tmp_eve_buf, 12 * 16 + 16);

    /* Store 8 */
    LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
    __lsx_vst(loc1, tmp_eve_buf, 4 * 16);
    __lsx_vst(loc3, tmp_eve_buf, 4 * 16 + 16);
    __lsx_vst(loc2, tmp_eve_buf, 10 * 16);
    __lsx_vst(loc0, tmp_eve_buf, 10 * 16 + 16);

    LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
    __lsx_vst(loc1, tmp_eve_buf, 6 * 16);
    __lsx_vst(loc3, tmp_eve_buf, 6 * 16 + 16);
    __lsx_vst(loc2, tmp_eve_buf, 8 * 16);
    __lsx_vst(loc0, tmp_eve_buf, 8 * 16 + 16);
}

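/* Odd half of the 32-point column transform for the same 8-column slice:
 * consumes the odd-indexed input rows and stores the 16 odd partial
 * results to tmp_odd_buf for the final butterfly. */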
static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
                                                  int16_t *tmp_odd_buf)
{
    __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
    __m128i zero = __lsx_vldi(0);

    /* Odd stage 1 */
    reg0 = __lsx_vld(tmp_buf, 64);
    reg1 = __lsx_vld(tmp_buf, 7 * 64);
    reg2 = __lsx_vld(tmp_buf, 9 * 64);
    reg3 = __lsx_vld(tmp_buf, 15 * 64);
    reg4 = __lsx_vld(tmp_buf, 17 * 64);
    reg5 = __lsx_vld(tmp_buf, 23 * 64);
    reg6 = __lsx_vld(tmp_buf, 25 * 64);
    reg7 = __lsx_vld(tmp_buf, 31 * 64);

    __lsx_vst(zero, tmp_buf, 64);
    __lsx_vst(zero, tmp_buf, 7 * 64);
    __lsx_vst(zero, tmp_buf, 9 * 64);
    __lsx_vst(zero, tmp_buf, 15 * 64);
    __lsx_vst(zero, tmp_buf, 17 * 64);
    __lsx_vst(zero, tmp_buf, 23 * 64);
    __lsx_vst(zero, tmp_buf, 25 * 64);
    __lsx_vst(zero, tmp_buf, 31 * 64);

    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);

    vec0 = __lsx_vadd_h(reg0, reg3);
    reg0 = __lsx_vsub_h(reg0, reg3);
    reg3 = __lsx_vadd_h(reg7, reg4);
    reg7 = __lsx_vsub_h(reg7, reg4);
    reg4 = __lsx_vadd_h(reg1, reg2);
    reg1 = __lsx_vsub_h(reg1, reg2);
    reg2 = __lsx_vadd_h(reg6, reg5);
    reg6 = __lsx_vsub_h(reg6, reg5);
    reg5 = vec0;

    /* 4 Stores */
    DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
    __lsx_vst(vec0, tmp_odd_buf, 4 * 16);
    __lsx_vst(vec1, tmp_odd_buf, 4 * 16 + 16);
    DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
    __lsx_vst(vec0, tmp_odd_buf, 0);
    __lsx_vst(vec1, tmp_odd_buf, 16);

    /* 4 Stores */
    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
    VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
    LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
    __lsx_vst(vec0, tmp_odd_buf, 6 * 16);
    __lsx_vst(vec1, tmp_odd_buf, 6 * 16 + 16);
    VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
    __lsx_vst(vec2, tmp_odd_buf, 2 * 16);
    __lsx_vst(vec3, tmp_odd_buf, 2 * 16 + 16);

    /* Odd stage 2 */
    /* 8 loads */
    reg0 = __lsx_vld(tmp_buf, 3 * 64);
    reg1 = __lsx_vld(tmp_buf, 5 * 64);
    reg2 = __lsx_vld(tmp_buf, 11 * 64);
    reg3 = __lsx_vld(tmp_buf, 13 * 64);
    reg4 = __lsx_vld(tmp_buf, 19 * 64);
    reg5 = __lsx_vld(tmp_buf, 21 * 64);
    reg6 = __lsx_vld(tmp_buf, 27 * 64);
    reg7 = __lsx_vld(tmp_buf, 29 * 64);

    __lsx_vst(zero, tmp_buf, 3 * 64);
    __lsx_vst(zero, tmp_buf, 5 * 64);
    __lsx_vst(zero, tmp_buf, 11 * 64);
    __lsx_vst(zero, tmp_buf, 13 * 64);
    __lsx_vst(zero, tmp_buf, 19 * 64);
    __lsx_vst(zero, tmp_buf, 21 * 64);
    __lsx_vst(zero, tmp_buf, 27 * 64);
    __lsx_vst(zero, tmp_buf, 29 * 64);

    VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
    VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
    VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);

    /* 4 Stores */
    DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
              vec0, vec1, vec2, vec3);
    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
    LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
    __lsx_vst(vec0, tmp_odd_buf, 12 * 16);
    __lsx_vst(vec1, tmp_odd_buf, 12 * 16 + 3 * 16);
    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
    __lsx_vst(vec0, tmp_odd_buf, 10 * 16);
    __lsx_vst(vec1, tmp_odd_buf, 10 * 16 + 16);

    /* 4 Stores */
    DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7,
              vec0, vec1, vec2, vec3);
    LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
    __lsx_vst(reg0, tmp_odd_buf, 13 * 16);
    __lsx_vst(reg1, tmp_odd_buf, 13 * 16 + 16);
    VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64,
                        reg0, reg1);
    __lsx_vst(reg0, tmp_odd_buf, 8 * 16);
    __lsx_vst(reg1, tmp_odd_buf, 8 * 16 + 16);

    /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
    /* Load 8 & Store 8 */
    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16,
              tmp_odd_buf, 32, tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 8 * 16, tmp_odd_buf, 8 * 16 + 16,
              tmp_odd_buf, 8 * 16 + 32, tmp_odd_buf, 8 * 16 + 48,
              reg4, reg5, reg6, reg7);

    DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
              loc0, loc1, loc2, loc3);
    __lsx_vst(loc0, tmp_odd_buf, 0);
    __lsx_vst(loc1, tmp_odd_buf, 16);
    __lsx_vst(loc2, tmp_odd_buf, 32);
    __lsx_vst(loc3, tmp_odd_buf, 48);
    DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

    DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
    __lsx_vst(loc0, tmp_odd_buf, 8 * 16);
    __lsx_vst(loc1, tmp_odd_buf, 8 * 16 + 16);
    __lsx_vst(loc2, tmp_odd_buf, 8 * 16 + 32);
    __lsx_vst(loc3, tmp_odd_buf, 8 * 16 + 48);

    /* Load 8 & Store 8 */
    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 4 * 16, tmp_odd_buf, 4 * 16 + 16,
              tmp_odd_buf, 4 * 16 + 32, tmp_odd_buf, 4 * 16 + 48,
              reg1, reg2, reg0, reg3);
    DUP4_ARG2(__lsx_vld, tmp_odd_buf, 12 * 16, tmp_odd_buf, 12 * 16 + 16,
              tmp_odd_buf, 12 * 16 + 32, tmp_odd_buf, 12 * 16 + 48,
              reg4, reg5, reg6, reg7);

    DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
              loc0, loc1, loc2, loc3);
    __lsx_vst(loc0, tmp_odd_buf, 4 * 16);
    __lsx_vst(loc1, tmp_odd_buf, 4 * 16 + 16);
    __lsx_vst(loc2, tmp_odd_buf, 4 * 16 + 32);
    __lsx_vst(loc3, tmp_odd_buf, 4 * 16 + 48);

    DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

    DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
    __lsx_vst(loc0, tmp_odd_buf, 12 * 16);
    __lsx_vst(loc1, tmp_odd_buf, 12 * 16 + 16);
    __lsx_vst(loc2, tmp_odd_buf, 12 * 16 + 32);
    __lsx_vst(loc3, tmp_odd_buf, 12 * 16 + 48);
}

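/* Final butterfly of the 32-point column transform: combine the even and
 * odd partial results, round by 6 bits and add them to the corresponding
 * rows of the destination picture. */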
vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, int16_t *tmp_odd_buf, uint8_t *dst, int32_t dst_stride)1152 static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
1153                                                  int16_t *tmp_odd_buf,
1154                                                  uint8_t *dst,
1155                                                  int32_t dst_stride)
1156 {
1157     __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
1158     __m128i m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
1159 
1160     /* FINAL BUTTERFLY : Dependency on Even & Odd */
1161     vec0 = __lsx_vld(tmp_odd_buf, 0);
1162     vec1 = __lsx_vld(tmp_odd_buf, 9 * 16);
1163     vec2 = __lsx_vld(tmp_odd_buf, 14 * 16);
1164     vec3 = __lsx_vld(tmp_odd_buf, 6 * 16);
1165     loc0 = __lsx_vld(tmp_eve_buf, 0);
1166     loc1 = __lsx_vld(tmp_eve_buf, 8 * 16);
1167     loc2 = __lsx_vld(tmp_eve_buf, 4 * 16);
1168     loc3 = __lsx_vld(tmp_eve_buf, 12 * 16);
1169 
1170     DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1171               m0, m4, m2, m6);
1172     DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
1173     VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
1174 
1175     DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
1176               m6, m2, m4, m0);
1177     DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
1178     VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
1179                         m0, m2, m4, m6);
1180 
    /* Load 8 & Store 8 */
    vec0 = __lsx_vld(tmp_odd_buf, 4 * 16);
    vec1 = __lsx_vld(tmp_odd_buf, 13 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 10 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 3 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 2 * 16);
    loc1 = __lsx_vld(tmp_eve_buf, 10 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 6 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 14 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              m1, m5, m3, m7);
    DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
    VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
                        m1, m3, m5, m7);

    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              m7, m3, m5, m1);
    DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
    VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
                        m1, m3, m5, m7);

    /* Load 8 & Store 8 */
    vec0 = __lsx_vld(tmp_odd_buf, 2 * 16);
    vec1 = __lsx_vld(tmp_odd_buf, 11 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 12 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 7 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 1 * 16);
    loc1 = __lsx_vld(tmp_eve_buf, 9 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 5 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 13 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              n0, n4, n2, n6);
    DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
    VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
                        n0, n2, n4, n6);
    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              n6, n2, n4, n0);
    DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
    VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
                        n0, n2, n4, n6);

    /* Load 8 & Store 8 */
    vec0 = __lsx_vld(tmp_odd_buf, 5 * 16);
    vec1 = __lsx_vld(tmp_odd_buf, 15 * 16);
    vec2 = __lsx_vld(tmp_odd_buf, 8 * 16);
    vec3 = __lsx_vld(tmp_odd_buf, 1 * 16);
    loc0 = __lsx_vld(tmp_eve_buf, 3 * 16);
    loc1 = __lsx_vld(tmp_eve_buf, 11 * 16);
    loc2 = __lsx_vld(tmp_eve_buf, 7 * 16);
    loc3 = __lsx_vld(tmp_eve_buf, 15 * 16);

    DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              n1, n5, n3, n7);
    DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
    VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
                        n1, n3, n5, n7);
    DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0,
              n7, n3, n5, n1);
    DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
    VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
                        n1, n3, n5, n7);
}

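/* One 8-column slice of the 32-point column IDCT: process the even and odd
 * coefficient halves into temporary buffers, then run the final butterfly
 * that adds the result to dst. */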
static void vp9_idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
                                               int32_t dst_stride)
{
    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(16);
    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(16);

    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
    vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
                                         dst, dst_stride);
}

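/* Row-pass variant of the 8-column slice: same even/odd processing, but the
 * butterfly output is transposed and written to 'output' instead of being
 * added to a destination block. */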
static void vp9_idct8x32_1d_columns_lsx(int16_t *input, int16_t *output,
                                        int16_t *tmp_buf)
{
    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(16);
    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(16);

    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
    vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0],
                                       &tmp_odd_buf[0], output);
}

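/* DC-only 32x32 inverse transform: with just input[0] non-zero, every pixel
 * of the block receives the same offset.  A scalar sketch of what the vector
 * code below computes for each pixel (illustration only):
 *
 *     int16_t dc = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, VP9_DCT_CONST_BITS);
 *     dc         = ROUND_POWER_OF_TWO(dc * cospi_16_64, VP9_DCT_CONST_BITS);
 *     dc         = ROUND_POWER_OF_TWO(dc, 6);
 *     dst[y * dst_stride + x] = av_clip_uint8(dst[y * dst_stride + x] + dc);
 */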
static void vp9_idct32x32_1_add_lsx(int16_t *input, uint8_t *dst,
                                    int32_t dst_stride)
{
    int32_t i;
    int16_t out;
    uint8_t *dst_tmp = dst + dst_stride;
    __m128i zero = __lsx_vldi(0);
    __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;

    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
    out = ROUND_POWER_OF_TWO(out, 6);
    input[0] = 0;

    vec = __lsx_vreplgr2vr_h(out);

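    /* Each iteration reconstructs two rows of 32 pixels: widen the bytes to
     * 16 bit, add the DC offset, clip to [0, 255] and pack back. */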
    for (i = 16; i--;) {
        DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
        DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst2, dst3);

        DUP4_ARG2(__lsx_vilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
                  res0, res1, res2, res3);
        DUP4_ARG2(__lsx_vilvh_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
                  res4, res5, res6, res7);
        DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec,
                  res0, res1, res2, res3);
        DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec,
                  res4, res5, res6, res7);
        DUP4_ARG1(__lsx_vclip255_h, res0, res1, res2, res3, res0, res1, res2, res3);
        DUP4_ARG1(__lsx_vclip255_h, res4, res5, res6, res7, res4, res5, res6, res7);
        DUP4_ARG2(__lsx_vpickev_b, res4, res0, res5, res1, res6, res2, res7, res3,
                  tmp0, tmp1, tmp2, tmp3);

        __lsx_vst(tmp0, dst, 0);
        __lsx_vst(tmp1, dst, 16);
        __lsx_vst(tmp2, dst_tmp, 0);
        __lsx_vst(tmp3, dst_tmp, 16);
        dst = dst_tmp + dst_stride;
        dst_tmp = dst + dst_stride;
    }
}

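/* 32x32 inverse transform for the eob <= 34 case (see
 * ff_idct_idct_32x32_add_lsx below): the non-zero coefficients can only sit
 * in the upper-left 8x8 region, i.e. the first 8 columns, so a single
 * 8-column row pass is enough; the intermediate buffer is pre-zeroed so the
 * untouched rows stay zero for the column pass. */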
static void vp9_idct32x32_34_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
                                               int32_t dst_stride)
{
    int32_t i;
    int16_t out_arr[32 * 32] ALLOC_ALIGNED(16);
    int16_t *out_ptr = out_arr;
    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(16);
    __m128i zero = __lsx_vldi(0);

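    /* Pre-zero the 32x32 intermediate buffer: the single row pass below only
     * fills its first 8 rows, the remaining rows must be zero for the column
     * pass that follows. */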
    for (i = 16; i--;) {
        __lsx_vst(zero, out_ptr, 0);
        __lsx_vst(zero, out_ptr, 16);
        __lsx_vst(zero, out_ptr, 32);
        __lsx_vst(zero, out_ptr, 48);
        __lsx_vst(zero, out_ptr, 64);
        __lsx_vst(zero, out_ptr, 80);
        __lsx_vst(zero, out_ptr, 96);
        __lsx_vst(zero, out_ptr, 112);
        out_ptr += 64;
    }

    out_ptr = out_arr;

    /* process 8*32 block */
    vp9_idct8x32_1d_columns_lsx(input, out_ptr, &tmp_buf[0]);

    /* transform columns */
    for (i = 0; i < 4; i++) {
        /* process 8*32 block */
        vp9_idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)),
                                           (dst + (i << 3)), dst_stride);
    }
}

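/* Full 32x32 inverse transform: four 8-column row passes into out_arr,
 * followed by four 8-column column passes that add the result to dst. */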
static void vp9_idct32x32_colcol_addblk_lsx(int16_t *input, uint8_t *dst,
                                            int32_t dst_stride)
{
    int32_t i;
    int16_t out_arr[32 * 32] ALLOC_ALIGNED(16);
    int16_t *out_ptr = out_arr;
    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(16);

    /* transform rows */
    for (i = 0; i < 4; i++) {
        /* process 8*32 block */
        vp9_idct8x32_1d_columns_lsx((input + (i << 3)), (out_ptr + (i << 8)),
                                    &tmp_buf[0]);
    }

    /* transform columns */
    for (i = 0; i < 4; i++) {
        /* process 8*32 block */
        vp9_idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)),
                                           (dst + (i << 3)), dst_stride);
    }
}

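/* Exported entry points, selected by eob (the number of non-zero
 * coefficients): eob == 1 is the DC-only shortcut, small eob values take a
 * reduced transform, anything larger the full path.  They are expected to be
 * hooked up from the LoongArch vp9dsp init code, roughly along these lines
 * (illustrative sketch, not the exact init file):
 *
 *     dsp->itxfm_add[TX_8X8][DCT_DCT]   = ff_idct_idct_8x8_add_lsx;
 *     dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_idct_idct_16x16_add_lsx;
 *     dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_idct_idct_32x32_add_lsx;
 */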
void ff_idct_idct_8x8_add_lsx(uint8_t *dst, ptrdiff_t stride,
                              int16_t *block, int eob)
{
    if (eob == 1) {
        vp9_idct8x8_1_add_lsx(block, dst, stride);
    }
    else if (eob <= 12) {
        vp9_idct8x8_12_colcol_addblk_lsx(block, dst, stride);
    }
    else {
        vp9_idct8x8_colcol_addblk_lsx(block, dst, stride);
    }
}

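/* 16x16: eob <= 10 keeps the non-zero coefficients near the top-left corner,
 * which is what the reduced 10-coefficient path assumes. */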
void ff_idct_idct_16x16_add_lsx(uint8_t *dst, ptrdiff_t stride,
                                int16_t *block, int eob)
{
    if (eob == 1) {
        /* DC only DCT coefficient. */
        vp9_idct16x16_1_add_lsx(block, dst, stride);
    }
    else if (eob <= 10) {
        vp9_idct16x16_10_colcol_addblk_lsx(block, dst, stride);
    }
    else {
        vp9_idct16x16_colcol_addblk_lsx(block, dst, stride);
    }
}

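/* 32x32: eob <= 34 confines the non-zero coefficients to the upper-left 8x8
 * region, which is what the reduced path above relies on. */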
void ff_idct_idct_32x32_add_lsx(uint8_t *dst, ptrdiff_t stride,
                                int16_t *block, int eob)
{
    if (eob == 1) {
        vp9_idct32x32_1_add_lsx(block, dst, stride);
    }
    else if (eob <= 34) {
        vp9_idct32x32_34_colcol_addblk_lsx(block, dst, stride);
    }
    else {
        vp9_idct32x32_colcol_addblk_lsx(block, dst, stride);
    }
}