1/*
2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/mips/generic_macros_msa.h"
22#include "idctdsp_mips.h"
23
24static void simple_idct_msa(int16_t *block)
25{
26    int32_t const_val;
27    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
28    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
29    v8i16 w1, w3, w5, w7;
30    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
31    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
32    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
33    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
34    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
35    v4i32 w2, w4, w6;
36    v8i16 select_vec, temp;
37    v8i16 zero = { 0 };
38    v4i32 const_val0 = __msa_ldi_w(1);
39    v4i32 const_val1 = __msa_ldi_w(1);
40
41    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
42    const_val0 <<= 10;
43    const_val = 16383 * ((1 << 19) / 16383);
44    const_val1 = __msa_insert_w(const_val0, 0, const_val);
45    const_val1 = __msa_splati_w(const_val1, 0);
46    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
47                       in0, in1, in2, in3, in4, in5, in6, in7);
48    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
49    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
50    UNPCK_SH_SW(in0, a0_r, a0_l);
51    UNPCK_SH_SW(in2, temp3_r, temp3_l);
52    temp = in0 << 3;
53    w2 = (v4i32) __msa_splati_h(weights, 2);
54    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
55    w4 = (v4i32) __msa_splati_h(weights, 4);
56    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
57    w6 = (v4i32) __msa_splati_h(weights, 6);
58    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
59    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
60    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
61    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
62         temp1_r, temp1_l, temp2_r, temp2_l);
63    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
64                temp2_l, temp2_r, temp1_l, temp1_r,
65                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
66    UNPCK_SH_SW(in4, temp0_r, temp0_l);
67    UNPCK_SH_SW(in6, temp3_r, temp3_l);
68    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
69    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
70         temp2_r, temp2_l, temp1_r, temp1_l);
71    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
72    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
73         a1_r, a1_l, a2_r, a2_l);
74    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
75         a3_r, a3_l, a0_r, a0_l);
76    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
77    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
78    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
79    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
80    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
81    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
82    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
83               const0, const1, const2, const3);
84    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
85    const5 = __msa_ilvod_h(-w1, -w5);
86    const7 = __msa_ilvod_h(w3, -w1);
87    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
88                b0_r, b1_r, b2_r, b3_r);
89    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
90                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
91    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
92                b0_l, b1_l, b2_l, b3_l);
93    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
94                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
95    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
96                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
97                 temp0_r, temp0_l, temp1_r, temp1_l,
98                 temp2_r, temp2_l, temp3_r, temp3_l,
99                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
100    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
101    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
102    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
103                temp2_l, temp2_r, temp3_l, temp3_r,
104                temp0_r, temp1_r, temp2_r, temp3_r);
105    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
106                               (v16u8) select_vec);
107    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
108                               (v16u8) select_vec);
109    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
110                               (v16u8) select_vec);
111    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
112                               (v16u8) select_vec);
113    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
114    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
115    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
116                a0_r, a1_r, a2_r, a3_r);
117    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
118    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
119    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
120    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
121    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
122                       in0, in1, in2, in3, in4, in5, in6, in7);
123
124    UNPCK_SH_SW(in0, a0_r, a0_l);
125    UNPCK_SH_SW(in2, temp3_r, temp3_l);
126    w2 = (v4i32) __msa_splati_h(weights, 2);
127    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
128    w4 = (v4i32) __msa_splati_h(weights, 4);
129    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
130    w6 = (v4i32) __msa_splati_h(weights, 6);
131    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
132    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
133    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
134    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
135         temp1_r, temp1_l, temp2_r, temp2_l);
136    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
137                temp2_l, temp2_r, temp1_l, temp1_r,
138                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
139    UNPCK_SH_SW(in4, temp0_r, temp0_l);
140    UNPCK_SH_SW(in6, temp3_r, temp3_l);
141    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
142    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
143         temp2_r, temp2_l, temp1_r, temp1_l);
144    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
145    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
146         a1_r, a1_l, a2_r, a2_l);
147    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
148         a3_r, a3_l, a0_r, a0_l);
149    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
150    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
151    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
152    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
153    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
154    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
155               const0, const1, const2, const3);
156    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
157                b0_r, b1_r, b2_r, b3_r);
158    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
159                b0_l, b1_l, b2_l, b3_l);
160    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
161    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
162    const5 = __msa_ilvod_h(-w1, -w5);
163    const7 = __msa_ilvod_h(w3, -w1);
164    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
165                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
166    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
167                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
168    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
169                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
170                 temp0_r, temp0_l, temp1_r, temp1_l,
171                 temp2_r, temp2_l, temp3_r, temp3_l,
172                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
173    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
174    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
175    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
176                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
177    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
178    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
179    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
180                a0_r, a1_r, a2_r, a3_r);
181    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
182           block, 8);
183}
184
/* 8x8 inverse DCT of block, with the clipped 0..255 result written to the
 * destination picture (dst, dst_stride) instead of back into block.
 * Same two-pass structure as simple_idct_msa(): row pass (bias 1 << 10,
 * shift 11) after a transpose, then column pass (bias
 * 16383 * ((1 << 19) / 16383), shift 20), with a dc << 3 shortcut for
 * rows whose AC coefficients are all zero. */
static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    /* Fixed-point cosine weights W0..W7 of the simple IDCT (W4 = 16383). */
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* Rounding bias for pass 1: 1 << 10 (shifted out by 11). */
    const_val0 <<= 10;
    /* Rounding bias for pass 2: 16383 * 32 = 524256 (shifted out by 20). */
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    /* Lane k of select_vec is all-ones iff source row k has no nonzero
     * AC coefficient; such rows use the dc << 3 shortcut (temp). */
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    /* ==== pass 1 (rows): even part on coefficients 0/2/4/6 ==== */
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;  /* DC-only shortcut value */
    /* Widen W2/W4/W6 to 32 bit. */
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part: coefficients 1/3/5/7 via dot products with paired weights. */
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even/odd halves; a* outputs come back in reverse row order. */
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    /* Select dc << 3 for AC-free rows, else the pass-1 result. */
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    /* ==== pass 2 (columns): same math, bias const_val1, shift 20.
     * NOTE(review): w2/w4/w6 and the const* vectors are unchanged since
     * pass 1; recomputing them below is redundant (simple_idct_add_msa()
     * computes them only once). */
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, in0, in1, in2, in3);
    PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                in4, in5, in6, in7);
    /* Clip rows 0..3 to 0..255, pack to bytes and store 8 bytes per row. */
    CLIP_SH4_0_255(in0, in1, in2, in3);
    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
                in0, in1, in2, in3);
    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    /* Rows 4..7: the a* path produced them in reverse order, so the tmp
     * assignment is reversed to store them top-to-bottom. */
    CLIP_SH4_0_255(in4, in5, in6, in7);
    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
                in4, in5, in6, in7);
    tmp3 = __msa_copy_u_d((v2i64) in4, 1);
    tmp2 = __msa_copy_u_d((v2i64) in5, 1);
    tmp1 = __msa_copy_u_d((v2i64) in6, 1);
    tmp0 = __msa_copy_u_d((v2i64) in7, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
}
360
/* 8x8 inverse DCT of block, with the result added to the destination
 * picture (dst, dst_stride) and clipped to 0..255.
 * Same two-pass structure as simple_idct_msa(); unlike the other two
 * variants, the invariant weight/constant vectors are computed once and
 * reused by the second pass. */
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    /* Fixed-point cosine weights W0..W7 of the simple IDCT (W4 = 16383). */
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    /* Rounding bias for pass 1: 1 << 10 (shifted out by 11). */
    const_val0 <<= 10;
    /* Rounding bias for pass 2: 16383 * 32 = 524256 (shifted out by 20). */
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    /* Lane k of select_vec is all-ones iff source row k has no nonzero
     * AC coefficient; such rows use the dc << 3 shortcut (temp). */
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    /* Widen/interleave all inputs up front. */
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    UNPCK_SH_SW(in4, temp4_r, temp4_l);
    UNPCK_SH_SW(in6, temp7_r, temp7_l);
    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
    temp = in0 << 3;  /* DC-only shortcut value */
    /* Odd-part weight pairs; built once, reused by pass 2. */
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    /* Pass 1 odd part: dot products of coefficients 1/3/5/7. */
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* W2/W4/W6 widened to 32 bit; also reused by pass 2. */
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Pass 1 even part on coefficients 0/2/4/6. */
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
    /* Combine even/odd halves; a* outputs come back in reverse row order. */
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    /* Select dc << 3 for AC-free rows, else the pass-1 result. */
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    /* ==== pass 2 (columns): reuses w2/w4/w6 and const0..const7 from
     * pass 1; bias const_val1, shift 20. ==== */
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    /* Rows 0..3: load destination pixels, zero-extend to 16 bit, add the
     * IDCT result, clip to 0..255, pack and store 8 bytes per row. */
    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
               temp0_l, temp1_l, temp2_l, temp3_l);
    in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l);
    in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l);
    in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l);
    in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l);
    CLIP_SH4_0_255(in0, in1, in2, in3);
    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3,
                in0, in1, in2, in3);
    tmp0 = __msa_copy_u_d((v2i64) in0, 1);
    tmp1 = __msa_copy_u_d((v2i64) in1, 1);
    tmp2 = __msa_copy_u_d((v2i64) in2, 1);
    tmp3 = __msa_copy_u_d((v2i64) in3, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);

    /* Rows 4..7: same add/clip/store; the a* path carries them in reverse
     * order, matched by the a3..a0 operand order below. */
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
               a3_l, a2_l, a1_l, a0_l);
    in4 = (v8i16) (a3_r) + (v8i16) (a3_l);
    in5 = (v8i16) (a2_r) + (v8i16) (a2_l);
    in6 = (v8i16) (a1_r) + (v8i16) (a1_l);
    in7 = (v8i16) (a0_r) + (v8i16) (a0_l);
    CLIP_SH4_0_255(in4, in5, in6, in7);
    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7,
                in4, in5, in6, in7);
    tmp0 = __msa_copy_u_d((v2i64) in4, 1);
    tmp1 = __msa_copy_u_d((v2i64) in5, 1);
    tmp2 = __msa_copy_u_d((v2i64) in6, 1);
    tmp3 = __msa_copy_u_d((v2i64) in7, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
}
543
/* Public entry point: in-place 8x8 IDCT of block via the MSA path. */
void ff_simple_idct_msa(int16_t *block)
{
    simple_idct_msa(block);
}
548
/* Public entry point: 8x8 IDCT of block, clipped result stored to dst.
 * NOTE(review): dst_stride is narrowed from ptrdiff_t to the helper's
 * int32_t parameter — assumes strides fit in 32 bits; confirm callers. */
void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
{
    simple_idct_put_msa(dst, dst_stride, block);
}
553
/* Public entry point: 8x8 IDCT of block, result added to dst and clipped.
 * NOTE(review): dst_stride is narrowed from ptrdiff_t to the helper's
 * int32_t parameter — assumes strides fit in 32 bits; confirm callers. */
void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
{
    simple_idct_add_msa(dst, dst_stride, block);
}
558