/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vc1dsp_loongarch.h"
#include "libavutil/avassert.h"
#include "libavutil/loongarch/loongson_intrinsics.h"

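/*
 * The const_N vectors below pack the VC-1 8-point inverse-transform
 * coefficients as pairs of 16-bit lanes, so that one xvdp2/xvdp2add dot
 * product evaluates two taps at once.  The scalar reference
 * (cf. ff_vc1_inv_trans_8x8_c) for one 8-point stage is:
 *
 *   t1 = 12 * (src[0] + src[4]);    t3 = 16 * src[2] +  6 * src[6];
 *   t2 = 12 * (src[0] - src[4]);    t4 =  6 * src[2] - 16 * src[6];
 *   t5 = t1 + t3;   t6 = t2 + t4;   t7 = t2 - t4;   t8 = t1 - t3;
 *   t1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
 *   t2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
 *   t3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
 *   t4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];
 *
 * so, e.g., const_1 repeats the pair {12, 12} and const_2 the pair
 * {12, -12} (0xfff4 is -12 as an int16_t).  The first pass rounds with
 * +4 and shifts by 3, the second with +64 and a shift by 7.
 */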
void ff_vc1_inv_trans_8x8_lasx(int16_t block[64])
{
    int32_t con_4    = 4;
    int32_t con_64   = 64;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4, t5, t6, t7, t8;
    __m256i const_1  = {0x000c000c000c000c, 0x000c000c000c000c,
                        0x000c000c000c000c, 0x000c000c000c000c};
    __m256i const_2  = {0xfff4000cfff4000c, 0xfff4000cfff4000c,
                        0xfff4000cfff4000c, 0xfff4000cfff4000c};
    __m256i const_3  = {0x0006001000060010, 0x0006001000060010,
                        0x0006001000060010, 0x0006001000060010};
    __m256i const_4  = {0xfff00006fff00006, 0xfff00006fff00006,
                        0xfff00006fff00006, 0xfff00006fff00006};
    __m256i const_5  = {0x000f0010000f0010, 0x000f0010000f0010,
                        0x000f0010000f0010, 0x000f0010000f0010};
    __m256i const_6  = {0x0004000900040009, 0x0004000900040009,
                        0x0004000900040009, 0x0004000900040009};
    __m256i const_7  = {0xfffc000ffffc000f, 0xfffc000ffffc000f,
                        0xfffc000ffffc000f, 0xfffc000ffffc000f};
    __m256i const_8  = {0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0,
                        0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0};
    __m256i const_9  = {0xfff00009fff00009, 0xfff00009fff00009,
                        0xfff00009fff00009, 0xfff00009fff00009};
    __m256i const_10 = {0x000f0004000f0004, 0x000f0004000f0004,
                        0x000f0004000f0004, 0x000f0004000f0004};
    __m256i const_11 = {0xfff70004fff70004, 0xfff70004fff70004,
                        0xfff70004fff70004, 0xfff70004fff70004};
    __m256i const_12 = {0xfff0000ffff0000f, 0xfff0000ffff0000f,
                        0xfff0000ffff0000f, 0xfff0000ffff0000f};

    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    /* first pass */
    DUP2_ARG2(__lasx_xvilvl_h, in2, in0, in3, in1, temp0, temp1);
    t2 = __lasx_xvreplgr2vr_w(con_4);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t2, temp0, const_1, t2, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    t5 = __lasx_xvadd_w(t1, t3);
    t6 = __lasx_xvadd_w(t2, t4);
    t7 = __lasx_xvsub_w(t2, t4);
    t8 = __lasx_xvsub_w(t1, t3);

    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, in3, in2, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);

    DUP4_ARG2(__lasx_xvadd_w, t1, t5, t6, t2, t7, t3, t8, t4,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in2, 3, in3, 3,
              in0, in1, in2, in3);

    /* second pass */
    DUP4_ARG2(__lasx_xvpackev_h, temp1, temp0, temp3, temp2, in1, in0,
              in3, in2, temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_w, temp1, temp0, temp3, temp2, t1, t3);
    DUP2_ARG2(__lasx_xvilvh_w, temp1, temp0, temp3, temp2, t2, t4);
    DUP4_ARG3(__lasx_xvpermi_q, t3, t1, 0x20, t3, t1, 0x31, t4, t2, 0x20,
              t4, t2, 0x31, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in3, in2, temp0, temp1);
    t3    = __lasx_xvreplgr2vr_w(con_64);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t3, temp0, const_1, t3, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    t5    = __lasx_xvadd_w(t1, t3);
    t6    = __lasx_xvadd_w(t2, t4);
    t7    = __lasx_xvsub_w(t2, t4);
    t8    = __lasx_xvsub_w(t1, t3);

    DUP2_ARG2(__lasx_xvilvh_h, in2, in0, in3, in1, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);

    DUP4_ARG2(__lasx_xvadd_w, t5, t1, t6, t2, t7, t3, t8, t4,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvaddi_wu, in0, 1, in1, 1, in2, 1, in3, 1,
              in0, in1, in2, in3);
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 7, temp3, temp2, 7,
              in1, in0, 7, in3, in2, 7, t1, t2, t3, t4);
    DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
              in0, in1, in2, in3);
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}

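/*
 * DC-only case: scale the single coefficient as in
 * ff_vc1_inv_trans_8x8_dc_c, then add it to all 64 pixels with unsigned
 * saturation (xvssrarni_bu_h clamps to [0, 255]).
 */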
void ff_vc1_inv_trans_8x8_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3;
    __m256i reg0, reg1, reg2, reg3;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    const_dc = __lasx_xvreplgr2vr_h(dc);
    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_d, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, temp2,
              const_dc, temp3, const_dc, reg0, reg1, reg2, reg3);
    DUP2_ARG3(__lasx_xvssrarni_bu_h, reg1, reg0, 0, reg3, reg2, 0,
              temp0, temp1);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
    __lasx_xvstelm_d(temp1, dst, 0, 0);
    __lasx_xvstelm_d(temp1, dst + stride, 0, 2);
    __lasx_xvstelm_d(temp1, dst + stride2, 0, 1);
    __lasx_xvstelm_d(temp1, dst + stride3, 0, 3);
}

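/*
 * 8x4 transform: an 8-point pass over the four rows, then a 4-point pass
 * down the eight columns.  The scalar 4-point reference
 * (cf. ff_vc1_inv_trans_4x4_c) is:
 *
 *   t1 = 17 * (src[0] + src[2]);    t3 = 22 * src[1] + 10 * src[3];
 *   t2 = 17 * (src[0] - src[2]);    t4 = 22 * src[3] - 10 * src[1];
 *
 * which const_8..const_11 encode as the 16-bit pairs {17, 17}, {17, -17},
 * {22, 10} and {-10, 22}.
 */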
void ff_vc1_inv_trans_8x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i shift    = {0x0000000400000000, 0x0000000500000001,
                        0x0000000600000002, 0x0000000700000003};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_1  = {0x00060010000C000C, 0x00060010000C000C,
                        0x00060010000C000C, 0x00060010000C000C};
    __m256i const_2  = {0xFFF00006FFF4000C, 0xFFF00006FFF4000C,
                        0xFFF00006FFF4000C, 0xFFF00006FFF4000C};
    __m256i const_3  = {0x0004000F00090010, 0x0004000F00090010,
                        0x0004000F00090010, 0x0004000F00090010};
    __m256i const_4  = {0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F,
                        0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F};
    __m256i const_5  = {0x000FFFF000040009, 0x000FFFF000040009,
                        0x000FFFF000040009, 0x000FFFF000040009};
    __m256i const_6  = {0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004,
                        0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004};
    __m256i const_7  = {0x0000000000000004, 0x0000000000000004,
                        0x0000000000000004, 0x0000000000000004};
    __m256i const_8  = {0x0011001100110011, 0x0011001100110011,
                        0x0011001100110011, 0x0011001100110011};
    __m256i const_9  = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                        0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_10 = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_11 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i in0, in1;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;

    DUP2_ARG2(__lasx_xvld, block, 0, block, 32, in0, in1);
    /* first pass */
    temp0 = __lasx_xvpermi_d(in0, 0xB1);
    temp1 = __lasx_xvpermi_d(in1, 0xB1);
    DUP2_ARG2(__lasx_xvilvl_h, temp0, in0, temp1, in1, temp0, temp1);
    temp2 = __lasx_xvpickev_w(temp1, temp0);
    temp3 = __lasx_xvpickod_w(temp1, temp0);

    DUP2_ARG2(__lasx_xvdp2_w_h, temp2, const_1, temp2, const_2, temp0, temp1);
    t1    = __lasx_xvadd_w(temp0, const_7);
    t2    = __lasx_xvadd_w(temp1, const_7);
    temp0 = __lasx_xvpickev_w(t2, t1);
    temp1 = __lasx_xvpickod_w(t2, t1);
    t3    = __lasx_xvadd_w(temp0, temp1);
    t4    = __lasx_xvsub_w(temp0, temp1);
    t4    = __lasx_xvpermi_d(t4, 0xB1);

    DUP4_ARG2(__lasx_xvdp4_d_h, temp3, const_3, temp3, const_4, temp3,
              const_5, temp3, const_6, t1, t2, temp0, temp1);
    temp2 = __lasx_xvpickev_w(t2, t1);
    temp3 = __lasx_xvpickev_w(temp1, temp0);

    t1    = __lasx_xvadd_w(temp2, t3);
    t2    = __lasx_xvadd_w(temp3, t4);
    temp0 = __lasx_xvsub_w(t4, temp3);
    temp1 = __lasx_xvsub_w(t3, temp2);
    /* second pass */
    DUP2_ARG3(__lasx_xvsrani_h_w, t2, t1, 3, temp1, temp0, 3, temp2, temp3);
    temp3 = __lasx_xvshuf4i_h(temp3, 0x4E);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    DUP2_ARG3(__lasx_xvdp2add_w_h, const_64, temp0, const_8, const_64, temp0,
              const_9, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_10, temp1, const_11, t3, t4);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              t1, t2, t3, t4);

    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvadd_w, temp0, t1, temp1, t2, temp2, t3, temp3, t4,
              t1, t2, t3, t4);
    DUP4_ARG1(__lasx_xvclip255_w, t1, t2, t3, t4, t1, t2, t3, t4);
    DUP2_ARG2(__lasx_xvpickev_h, t2, t1, t4, t3, temp0, temp1);
    temp2 = __lasx_xvpickev_b(temp1, temp0);
    temp0 = __lasx_xvperm_w(temp2, shift);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
}

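/*
 * DC-only 8x4 case: dc is scaled by the 8-point DC gain (3 / 2) for the
 * row pass and the 4-point gain (17 / 128) for the column pass, as in
 * ff_vc1_inv_trans_8x4_dc_c.
 */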
void ff_vc1_inv_trans_8x4_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i const_dc, temp0, temp1, reg0, reg1;

    dc = (3  * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, temp0, temp1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
}

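/*
 * DC-only 4x8 case: the 4-point gain (17 / 8) is applied for the row
 * pass and the 8-point gain (12 / 128) for the column pass, as in
 * ff_vc1_inv_trans_4x8_dc_c.
 */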
void ff_vc1_inv_trans_4x8_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3, reg0, reg1;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_d, temp1, temp0, temp3, temp2, reg0, reg1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, reg0, reg1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 3);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 7);
}

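/*
 * 4x8 transform: a 4-point pass over the eight rows, then an 8-point
 * pass down the four columns.  const_7..const_12 carry different
 * coefficient pairs in their low and high 128-bit halves so that both
 * halves of the column pass are computed in a single register.
 */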
void ff_vc1_inv_trans_4x8_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;

    __m256i const_1  = {0x0011001100110011, 0x0011001100110011,
                        0x0011001100110011, 0x0011001100110011};
    __m256i const_2  = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                        0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_3  = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_4  = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i const_5  = {0x0000000400000004, 0x0000000400000004,
                        0x0000000400000004, 0x0000000400000004};
    __m256i const_6  = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_7  = {0x000C000C000C000C, 0x000C000C000C000C,
                        0xFFF4000CFFF4000C, 0xFFF4000CFFF4000C};
    __m256i const_8  = {0x0006001000060010, 0x0006001000060010,
                        0xFFF00006FFF00006, 0xFFF00006FFF00006};
    __m256i const_9  = {0x0009001000090010, 0x0009001000090010,
                        0x0004000F0004000F, 0x0004000F0004000F};
    __m256i const_10 = {0xFFF0000FFFF0000F, 0xFFF0000FFFF0000F,
                        0xFFF7FFFCFFF7FFFC, 0xFFF7FFFCFFF7FFFC};
    __m256i const_11 = {0x0004000900040009, 0x0004000900040009,
                        0x000FFFF0000FFFF0, 0x000FFFF0000FFFF0};
    __m256i const_12 = {0x000F0004000F0004, 0x000F0004000F0004,
                        0xFFF0FFF7FFF0FFF7, 0xFFF0FFF7FFF0FFF7};
    __m256i shift    = {0x0000000400000000, 0x0000000600000002,
                        0x0000000500000001, 0x0000000700000003};

    /* first pass */
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
              in0, in1, in2, in3);
    in0   = __lasx_xvilvl_d(in1, in0);
    in1   = __lasx_xvilvl_d(in3, in2);
    temp0 = __lasx_xvpickev_h(in1, in0);
    temp1 = __lasx_xvpickod_h(in1, in0);
    temp0 = __lasx_xvperm_w(temp0, shift);
    temp1 = __lasx_xvperm_w(temp1, shift);

    DUP2_ARG3(__lasx_xvdp2add_w_h, const_5, temp0, const_1, const_5, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);

    /* second pass */
    t1    = __lasx_xvpickev_w(temp1, temp0);
    t2    = __lasx_xvpickev_w(temp3, temp2);
    t1    = __lasx_xvpickev_h(t2, t1);
    t3    = __lasx_xvpickod_w(temp1, temp0);
    t4    = __lasx_xvpickod_w(temp3, temp2);
    temp1 = __lasx_xvpickev_h(t4, t3);
    temp2 = __lasx_xvpermi_q(t1, t1, 0x00);
    temp3 = __lasx_xvpermi_q(t1, t1, 0x11);
    t1 = __lasx_xvdp2add_w_h(const_6, temp2, const_7);
    t2 = __lasx_xvdp2_w_h(temp3, const_8);
    t3    = __lasx_xvadd_w(t1, t2);
    t4    = __lasx_xvsub_w(t1, t2);
    t4    = __lasx_xvpermi_d(t4, 0x4E);

    DUP4_ARG2(__lasx_xvdp2_w_h, temp1, const_9, temp1, const_10, temp1,
              const_11, temp1, const_12, t1, t2, temp2, temp3);

    temp0 = __lasx_xvpermi_q(t2, t1, 0x20);
    temp1 = __lasx_xvpermi_q(t2, t1, 0x31);
    t1    = __lasx_xvadd_w(temp0, temp1);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    t2    = __lasx_xvadd_w(temp1, temp0);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvadd_w(t2, t4);
    temp2 = __lasx_xvsub_w(t4, t2);
    temp3 = __lasx_xvsub_w(t3, t1);
    temp2 = __lasx_xvaddi_wu(temp2, 1);
    temp3 = __lasx_xvaddi_wu(temp3, 1);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2, 0,
              dst + stride3, 0, const_5, const_6, const_7, const_8);

    DUP4_ARG2(__lasx_xvilvl_w, const_2, const_1, const_4, const_3, const_5,
              const_6, const_7, const_8, const_1, const_2, const_3, const_4);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, const_1, const_2, const_3, const_4,
              const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvadd_w, temp0, const_1, temp1, const_2, temp2, const_3,
              temp3, const_4, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvpickev_h, temp1, temp0, temp3, temp2, temp0, temp1);
    temp0   = __lasx_xvpickev_b(temp1, temp0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 7);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 3);
}

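/*
 * DC-only 4x4 case: the 4-point DC gain of 17 is applied once per
 * dimension, as in ff_vc1_inv_trans_4x4_dc_c.
 */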
void ff_vc1_inv_trans_4x4_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3, temp0, temp1, const_dc;
    __m256i zero  = {0};

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, temp0, temp1);
    in0   = __lasx_xvpermi_q(temp1, temp0, 0x20);
    temp0 = __lasx_xvilvl_b(zero, in0);
    in0   = __lasx_xvadd_h(temp0, const_dc);
    temp0 = __lasx_xvssrarni_bu_h(in0, in0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 1);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 5);
}

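/*
 * 4x4 transform: both passes use the 4-point coefficients.  const_1 and
 * const_2 alternate the pairs {17, 17}/{17, -17} and {22, 10}/{-10, 22}
 * between adjacent 64-bit lanes, so t1 and t2 of the scalar reference
 * come out of a single dot product side by side.
 */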
void ff_vc1_inv_trans_4x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2;

    __m256i const_1  = {0x0011001100110011, 0xFFEF0011FFEF0011,
                        0x0011001100110011, 0xFFEF0011FFEF0011};
    __m256i const_2  = {0x000A0016000A0016, 0x0016FFF60016FFF6,
                        0x000A0016000A0016, 0x0016FFF60016FFF6};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};

    DUP2_ARG2(__lasx_xvld, block, 0, block, 32, in0, in1);
    /* first pass */
    temp0 = __lasx_xvilvl_d(in1, in0);
    temp1 = __lasx_xvpickev_h(temp0, temp0);
    temp2 = __lasx_xvpickod_h(temp0, temp0);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_1, temp2, const_2, t1, t2);
    t1    = __lasx_xvaddi_wu(t1, 4);
    in0   = __lasx_xvadd_w(t1, t2);
    in1   = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in0, in1);
    /* second pass */
    temp0   = __lasx_xvpickev_h(in1, in0);
    temp1   = __lasx_xvpermi_q(temp0, temp0, 0x00);
    temp2   = __lasx_xvpermi_q(temp0, temp0, 0x11);
    const_1 = __lasx_xvpermi_d(const_1, 0xD8);
    const_2 = __lasx_xvpermi_d(const_2, 0xD8);
    t1 = __lasx_xvdp2add_w_h(const_64, temp1, const_1);
    t2 = __lasx_xvdp2_w_h(temp2, const_2);
    in0     = __lasx_xvadd_w(t1, t2);
    in1     = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 7, in1, 7, in0, in1);
    temp0   = __lasx_xvshuf4i_w(in0, 0x9C);
    temp1   = __lasx_xvshuf4i_w(in1, 0x9C);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    temp2   = __lasx_xvilvl_w(in2, in0);
    temp2   = __lasx_vext2xv_wu_bu(temp2);
    temp3   = __lasx_xvilvl_w(in1, in3);
    temp3   = __lasx_vext2xv_wu_bu(temp3);
    temp0   = __lasx_xvadd_w(temp0, temp2);
    temp1   = __lasx_xvadd_w(temp1, temp3);
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);
    temp1   = __lasx_xvpickev_h(temp1, temp0);
    temp0   = __lasx_xvpickev_b(temp1, temp1);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 5);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 1);
}

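/*
 * Quarter-pel interpolation of one 8x8 block: vertical filter pass,
 * transpose, then horizontal filter pass.  para_value[] stores the tap
 * magnitudes of the VC-1 4-tap bicubic filters, whose outer taps are
 * negative and applied via xvdp2sub:
 *
 *   mode 1: (-4, 53, 18, -3) / 64
 *   mode 2: (-1,  9,  9, -1) / 16
 *   mode 3: (-3, 18, 53, -4) / 64
 *
 * shift_value[] and r split the total shift between the two passes in
 * the same way as the scalar ff_put_vc1_mspel_mc.
 */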
static void put_vc1_mspel_mc_h_v_lasx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int hmode, int vmode,
                                      int rnd)
{
    __m256i in0, in1, in2, in3;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    __m256i temp0, temp1, const_para1_2, const_para0_3;
    __m256i const_r, const_sh;
    __m256i sh = {0x0000000400000000, 0x0000000500000001,
                  0x0000000600000002, 0x0000000700000003};
    static const uint8_t para_value[][4] = {{4, 3, 53, 18},
                                            {1, 1, 9, 9},
                                            {3, 4, 18, 53}};
    static const int shift_value[] = {0, 5, 1, 5};
    int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
    int r     = (1 << (shift - 1)) + rnd - 1;
    const uint8_t *para_v = para_value[vmode - 1];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;

    const_r  = __lasx_xvreplgr2vr_h(r);
    const_sh = __lasx_xvreplgr2vr_h(shift);
    src -= stride + 1;
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    DUP4_ARG2(__lasx_xvld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
    src  += stride4;
    in0   = __lasx_xvld(src, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t1 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_h_bu(t1, temp1, const_para0_3);
    src  += stride;
    in1   = __lasx_xvld(src, 0);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t2 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_h_bu(t2, temp1, const_para0_3);
    src  += stride;
    in2   = __lasx_xvld(src, 0);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t3 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_h_bu(t3, temp1, const_para0_3);
    src  += stride;
    in3   = __lasx_xvld(src, 0);
    in3   = __lasx_xvpermi_d(in3, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t4 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_h_bu(t4, temp1, const_para0_3);
    src  += stride;
    in0   = __lasx_xvld(src, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t5 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_h_bu(t5, temp1, const_para0_3);
    src  += stride;
    in1   = __lasx_xvld(src, 0);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t6 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_h_bu(t6, temp1, const_para0_3);
    src  += stride;
    in2   = __lasx_xvld(src, 0);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t7 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_h_bu(t7, temp1, const_para0_3);
    DUP4_ARG2(__lasx_xvadd_h, t0, const_r, t1, const_r, t2, const_r, t3,
              const_r, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvadd_h, t4, const_r, t5, const_r, t6, const_r, t7,
              const_r, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvsra_h, t0, const_sh, t1, const_sh, t2, const_sh,
              t3, const_sh, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsra_h, t4, const_sh, t5, const_sh, t6, const_sh,
              t7, const_sh, t4, t5, t6, t7);
    LASX_TRANSPOSE8x8_H(t0, t1, t2, t3, t4, t5, t6, t7, t0,
                        t1, t2, t3, t4, t5, t6, t7);
    para_v  = para_value[hmode - 1];
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    const_para0_3 = __lasx_vext2xv_h_b(const_para0_3);
    const_para1_2 = __lasx_vext2xv_h_b(const_para1_2);
    r       = 64 - rnd;
    const_r = __lasx_xvreplgr2vr_w(r);
    DUP4_ARG2(__lasx_xvpermi_d, t0, 0x72, t1, 0x72, t2, 0x72, t0, 0xD8,
              in0, in1, in2, t0);
    DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
              t1, t2, t3, t4);
    DUP2_ARG2(__lasx_xvpermi_d, t5, 0xD8, t6, 0xD8, t5, t6);
    t7      = __lasx_xvpermi_d(t7, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_h, t2, t1, t3, t0, temp0, temp1);
    t0 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_w_h(t0, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t3, t2, t4, t1, temp0, temp1);
    t1 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_w_h(t1, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t4, t3, t5, t2, temp0, temp1);
    t2 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_w_h(t2, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t5, t4, t6, t3, temp0, temp1);
    t3 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_w_h(t3, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t6, t5, t7, t4, temp0, temp1);
    t4 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_w_h(t4, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t7, t6, in0, t5, temp0, temp1);
    t5 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_w_h(t5, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in0, t7, in1, t6, temp0, temp1);
    t6 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_w_h(t6, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in2, t7, temp0, temp1);
    t7 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_w_h(t7, temp1, const_para0_3);
    DUP4_ARG2(__lasx_xvadd_w, t0, const_r, t1, const_r, t2, const_r,
              t3, const_r, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvadd_w, t4, const_r, t5, const_r, t6, const_r,
              t7, const_r, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvsrai_w, t0, 7, t1, 7, t2, 7, t3, 7, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsrai_w, t4, 7, t5, 7, t6, 7, t7, 7, t4, t5, t6, t7);
    LASX_TRANSPOSE8x8_W(t0, t1, t2, t3, t4, t5, t6, t7,
                        t0, t1, t2, t3, t4, t5, t6, t7);
    DUP4_ARG1(__lasx_xvclip255_w, t0, t1, t2, t3, t0, t1, t2, t3);
    DUP4_ARG1(__lasx_xvclip255_w, t4, t5, t6, t7, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvpickev_h, t1, t0, t3, t2, t5, t4, t7, t6,
              t0, t1, t2, t3);
    DUP2_ARG2(__lasx_xvpickev_b, t1, t0, t3, t2, t0, t1);
    t0 = __lasx_xvperm_w(t0, sh);
    t1 = __lasx_xvperm_w(t1, sh);
    __lasx_xvstelm_d(t0, dst, 0, 0);
    __lasx_xvstelm_d(t0, dst + stride, 0, 1);
    __lasx_xvstelm_d(t0, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t0, dst + stride3, 0, 3);
    dst += stride4;
    __lasx_xvstelm_d(t1, dst, 0, 0);
    __lasx_xvstelm_d(t1, dst + stride, 0, 1);
    __lasx_xvstelm_d(t1, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t1, dst + stride3, 0, 3);
}

#define PUT_VC1_MSPEL_MC_LASX(hmode, vmode)                                   \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _lasx(uint8_t *dst,             \
                                                const uint8_t *src,           \
                                                ptrdiff_t stride, int rnd)    \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
}                                                                             \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_lasx(uint8_t *dst,          \
                                                   const uint8_t *src,        \
                                                   ptrdiff_t stride, int rnd) \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
    dst += 8 * stride, src += 8 * stride;                                     \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
}

PUT_VC1_MSPEL_MC_LASX(1, 1);
PUT_VC1_MSPEL_MC_LASX(1, 2);
PUT_VC1_MSPEL_MC_LASX(1, 3);

PUT_VC1_MSPEL_MC_LASX(2, 1);
PUT_VC1_MSPEL_MC_LASX(2, 2);
PUT_VC1_MSPEL_MC_LASX(2, 3);

PUT_VC1_MSPEL_MC_LASX(3, 1);
PUT_VC1_MSPEL_MC_LASX(3, 2);
PUT_VC1_MSPEL_MC_LASX(3, 3);

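/*
 * Bilinear chroma MC, no-rounding variant: each output pixel is
 * (A * src[x] + B * src[x + 1] + C * src[x + stride] +
 *  D * src[x + stride + 1] + 28) >> 6, matching
 * put_no_rnd_vc1_chroma_mc8_c.
 */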
void ff_put_no_rnd_vc1_chroma_mc8_lasx(uint8_t *dst /* align 8 */,
                                       uint8_t *src /* align 1 */,
                                       ptrdiff_t stride, int h, int x, int y)
{
    const int intA = (8 - x) * (8 - y);
    const int intB =     (x) * (8 - y);
    const int intC = (8 - x) *     (y);
    const int intD =     (x) *     (y);
    __m256i src00, src01, src10, src11;
    __m256i A, B, C, D;
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    A = __lasx_xvreplgr2vr_h(intA);
    B = __lasx_xvreplgr2vr_h(intB);
    C = __lasx_xvreplgr2vr_h(intC);
    D = __lasx_xvreplgr2vr_h(intD);
    for (i = 0; i < h; i++) {
        DUP2_ARG2(__lasx_xvld, src, 0, src, 1, src00, src01);
        src += stride;
        DUP2_ARG2(__lasx_xvld, src, 0, src, 1, src10, src11);

        DUP4_ARG1(__lasx_vext2xv_hu_bu, src00, src01, src10, src11,
                  src00, src01, src10, src11);
        DUP4_ARG2(__lasx_xvmul_h, src00, A, src01, B, src10, C, src11, D,
                  src00, src01, src10, src11);
        src00 = __lasx_xvadd_h(src00, src01);
        src10 = __lasx_xvadd_h(src10, src11);
        src00 = __lasx_xvadd_h(src00, src10);
        src00 = __lasx_xvaddi_hu(src00, 28);
        src00 = __lasx_xvsrli_h(src00, 6);
        src00 = __lasx_xvpickev_b(src00, src00);
        __lasx_xvstelm_d(src00, dst, 0, 0);
        dst += stride;
    }
}

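/*
 * Vertical-only quarter-pel filter for a 16-pixel-wide block: each
 * iteration applies the 4-tap column filter to one output row, and the
 * three upper source rows are carried over to the next iteration.
 */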
static void put_vc1_mspel_mc_v_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int vmode, int rnd)
{
    __m256i in0, in1, in2, in3, temp0, temp1, t0;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[vmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    int add_value[3];
    ptrdiff_t stride_2x = stride << 1;
    int i = 0;

    add_value[0] = add_value[2] = 31 + rnd;
    add_value[1] = 7 + rnd;

    const_r  = __lasx_xvreplgr2vr_h(add_value[vmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[vmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    DUP2_ARG2(__lasx_xvld, src - stride, 0, src, 0, in0, in1);
    in2 = __lasx_xvld(src + stride, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    for (; i < 16; i++) {
        in3 = __lasx_xvld(src + stride_2x, 0);
        in3 = __lasx_xvpermi_d(in3, 0xD8);
        DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
        t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
        t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
        t0 = __lasx_xvadd_h(t0, const_r);
        t0 = __lasx_xvsra_h(t0, const_sh);
        t0 = __lasx_xvclip255_h(t0);
        t0 = __lasx_xvpickev_b(t0, t0);
        __lasx_xvstelm_d(t0, dst, 0, 0);
        __lasx_xvstelm_d(t0, dst, 8, 2);
        dst += stride;
        src += stride;
        in0 = in1;
        in1 = in2;
        in2 = in3;
    }
}

#define PUT_VC1_MSPEL_MC_V_LASX(vmode)                                    \
void ff_put_vc1_mspel_mc0 ## vmode ## _16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_v_lasx(dst, src, stride, vmode, rnd);                \
}

PUT_VC1_MSPEL_MC_V_LASX(1);
PUT_VC1_MSPEL_MC_V_LASX(2);
PUT_VC1_MSPEL_MC_V_LASX(3);

#define ROW_LASX(in0, in1, in2, in3, out0)                                \
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, tmp0_m, tmp1_m);       \
    out0 = __lasx_xvdp2_h_bu(tmp0_m, const_para1_2);                      \
    out0 = __lasx_xvdp2sub_h_bu(out0, tmp1_m, const_para0_3);             \
    out0 = __lasx_xvadd_h(out0, const_r);                                 \
    out0 = __lasx_xvsra_h(out0, const_sh);                                \
    out0 = __lasx_xvclip255_h(out0);                                      \
    out0 = __lasx_xvpickev_b(out0, out0);                                 \
    out0 = __lasx_xvpermi_d(out0, 0xD8);

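/*
 * Horizontal-only quarter-pel filter for a 16x16 block: the rows are
 * transposed with interleaves so ROW_LASX can apply the same 4-tap
 * filter pattern as the vertical case, and the result is transposed
 * back before the stores.
 */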
static void put_vc1_mspel_mc_h_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int hmode, int rnd)
{
    __m256i in0, in1, in2, in3, in4, in5, in6, in7,
            in8, in9, in10, in11, in12, in13, in14, in15;
    __m256i out0, out1, out2, out3, out4, out5, out6, out7, out8, out9,
            out10, out11, out12, out13, out14, out15, out16, out17, out18;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;
    __m256i tmp4_m, tmp5_m, tmp6_m, tmp7_m;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[hmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    int add_value[3];
    uint8_t *_src = (uint8_t*)src - 1;

    add_value[0] = add_value[2] = 32 - rnd;
    add_value[1] = 8 - rnd;

    const_r  = __lasx_xvreplgr2vr_h(add_value[hmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[hmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    in0 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in1, in2);
    in3 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in4 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in5, in6);
    in7 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in8 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in9, in10);
    in11 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in12 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in13, in14);
    in15 = __lasx_xvldx(_src, stride3);
    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);

    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
    DUP2_ARG3(__lasx_xvpermi_q, out0, out0, 0x31, out1, out1, 0x31, out16, out17);
    out18 = __lasx_xvpermi_q(out2, out2, 0x31);

    DUP4_ARG2(__lasx_xvpermi_d, out0, 0xD8, out1, 0xD8, out2, 0xD8, out3, 0xD8,
              out0, out1, out2, out3);
    DUP4_ARG2(__lasx_xvpermi_d, out4, 0xD8, out5, 0xD8, out6, 0xD8, out7, 0xD8,
              out4, out5, out6, out7);
    DUP4_ARG2(__lasx_xvpermi_d, out8, 0xD8, out9, 0xD8, out10, 0xD8, out11,
              0xD8, out8, out9, out10, out11);
    DUP4_ARG2(__lasx_xvpermi_d, out12, 0xD8, out13, 0xD8, out14, 0xD8, out15,
              0xD8, out12, out13, out14, out15);
    out16 = __lasx_xvpermi_d(out16, 0xD8);
    out17 = __lasx_xvpermi_d(out17, 0xD8);
    out18 = __lasx_xvpermi_d(out18, 0xD8);

    ROW_LASX(out0,  out1,  out2,  out3,  in0);
    ROW_LASX(out1,  out2,  out3,  out4,  in1);
    ROW_LASX(out2,  out3,  out4,  out5,  in2);
    ROW_LASX(out3,  out4,  out5,  out6,  in3);
    ROW_LASX(out4,  out5,  out6,  out7,  in4);
    ROW_LASX(out5,  out6,  out7,  out8,  in5);
    ROW_LASX(out6,  out7,  out8,  out9,  in6);
    ROW_LASX(out7,  out8,  out9,  out10, in7);
    ROW_LASX(out8,  out9,  out10, out11, in8);
    ROW_LASX(out9,  out10, out11, out12, in9);
    ROW_LASX(out10, out11, out12, out13, in10);
    ROW_LASX(out11, out12, out13, out14, in11);
    ROW_LASX(out12, out13, out14, out15, in12);
    ROW_LASX(out13, out14, out15, out16, in13);
    ROW_LASX(out14, out15, out16, out17, in14);
    ROW_LASX(out15, out16, out17, out18, in15);

    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);

    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out4, dst, 0, 0);
    __lasx_xvstelm_d(out4, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out5, dst, 0, 0);
    __lasx_xvstelm_d(out5, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out6, dst, 0, 0);
    __lasx_xvstelm_d(out6, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out7, dst, 0, 0);
    __lasx_xvstelm_d(out7, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out8, dst, 0, 0);
    __lasx_xvstelm_d(out8, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out9, dst, 0, 0);
    __lasx_xvstelm_d(out9, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out10, dst, 0, 0);
    __lasx_xvstelm_d(out10, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out11, dst, 0, 0);
    __lasx_xvstelm_d(out11, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out12, dst, 0, 0);
    __lasx_xvstelm_d(out12, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out13, dst, 0, 0);
    __lasx_xvstelm_d(out13, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out14, dst, 0, 0);
    __lasx_xvstelm_d(out14, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out15, dst, 0, 0);
    __lasx_xvstelm_d(out15, dst, 8, 1);
}
994
995#define PUT_VC1_MSPEL_MC_H_LASX(hmode)                                    \
996void ff_put_vc1_mspel_mc ## hmode ## 0_16_lasx(uint8_t *dst,              \
997                                               const uint8_t *src,        \
998                                               ptrdiff_t stride, int rnd) \
999{                                                                         \
1000    put_vc1_mspel_mc_h_lasx(dst, src, stride, hmode, rnd);                \
1001}
1002
1003PUT_VC1_MSPEL_MC_H_LASX(1);
1004PUT_VC1_MSPEL_MC_H_LASX(2);
1005PUT_VC1_MSPEL_MC_H_LASX(3);
1006