1/*
2 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
3 *
4 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "libavutil/attributes.h"
24#include "libavutil/avassert.h"
25#include "libavutil/mem_internal.h"
26
27#include "libavcodec/vc1dsp.h"
28#include "constants.h"
29#include "vc1dsp_mips.h"
30#include "hpeldsp_mips.h"
31#include "libavutil/mips/mmiutils.h"
32
33#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
34        "li         %[tmp0],    "#r1"                                 \n\t" \
35        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
36        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
37        "li         %[tmp0],    "#r2"                                 \n\t" \
38        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
39        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
40        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
41        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
42        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
43        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
44        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
45        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
46                                                                            \
47        "li         %[tmp0],    "#r3"                                 \n\t" \
48        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
49        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
50        "li         %[tmp0],    "#r4"                                 \n\t" \
51        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
52        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
53        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
54        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
55        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
56        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
57        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
58        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
59                                                                            \
60        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
61        "paddw      %[ftmp2],   %[ftmp2],   "#c0"                     \n\t" \
62        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
63        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
64        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
65        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
66        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
67        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
68        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
69        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
70        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
71        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
72        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
73        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
74        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
75        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
76
77#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
78        "li         %[tmp0],    "#r1"                                 \n\t" \
79        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
80        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
81        "li         %[tmp0],    "#r2"                                 \n\t" \
82        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
83        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
84        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
85        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
86        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
87        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
88        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
89        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
90                                                                            \
91        "li         %[tmp0],    "#r3"                                 \n\t" \
92        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
93        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
94        "li         %[tmp0],    "#r4"                                 \n\t" \
95        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
96        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
97        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
98        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
99        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
100        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
101        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
102        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
103                                                                            \
104        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
105        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
106        "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
107        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
108        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
109        "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
110        "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
111        "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
112        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
113        "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
114        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
115        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
116        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
117        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
118        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
119        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
120        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
121        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
122        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
123        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
124
125/* Do inverse transform on 8x8 block */
126void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
127{
128    int dc = block[0];
129    double ftmp[9];
130    mips_reg addr[1];
131    int count;
132    union mmi_intfloat64 dc_u;
133
134    dc = (3 * dc +  1) >> 1;
135    dc = (3 * dc + 16) >> 5;
136    dc_u.i = dc;
137
138    __asm__ volatile(
139        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
140        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
141        "li         %[count],   0x02                                    \n\t"
142
143        "1:                                                             \n\t"
144        MMI_LDC1(%[ftmp1], %[dest], 0x00)
145        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
146        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
147        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
148        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
149        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
150        MMI_LDC1(%[ftmp4], %[addr0], 0x00)
151
152        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
153        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
154        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
155        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
156        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
157        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
158        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
159        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
160
161        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
162        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
163        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
164        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
165        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
166        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
167        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
168        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
169
170        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
171        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
172        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
173        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
174
175        MMI_SDC1(%[ftmp1], %[dest], 0x00)
176        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
177        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
178        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
179        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
180        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
181        MMI_SDC1(%[ftmp4], %[addr0], 0x00)
182
183        "addiu      %[count],   %[count],       -0x01                   \n\t"
184        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
185        "bnez       %[count],   1b                                      \n\t"
186        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
187          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
188          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
189          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
190          [ftmp8]"=&f"(ftmp[8]),
191          [addr0]"=&r"(addr[0]),
192          [count]"=&r"(count),          [dest]"+&r"(dest)
193        : [linesize]"r"((mips_reg)linesize),
194          [dc]"f"(dc_u.f)
195        : "memory"
196    );
197}
198
199#if _MIPS_SIM != _ABIO32
200void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
201{
202    DECLARE_ALIGNED(16, int16_t, temp[64]);
203    double ftmp[23];
204    uint64_t tmp[1];
205
206    __asm__ volatile (
207        /* 1st loop: start */
208        "li         %[tmp0],    0x03                                    \n\t"
209        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
210
211       // 1st part
212        MMI_LDC1(%[ftmp1], %[block], 0x00)
213        MMI_LDC1(%[ftmp11], %[block], 0x10)
214        MMI_LDC1(%[ftmp2], %[block], 0x20)
215        MMI_LDC1(%[ftmp12], %[block], 0x30)
216        MMI_LDC1(%[ftmp3], %[block], 0x40)
217        MMI_LDC1(%[ftmp13], %[block], 0x50)
218        MMI_LDC1(%[ftmp4], %[block], 0x60)
219        MMI_LDC1(%[ftmp14], %[block], 0x70)
220        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
221        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
222        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
223        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
224
225        "punpcklhw  %[ftmp9],  %[ftmp11],  %[ftmp12]                    \n\t"
226        "punpckhhw  %[ftmp10], %[ftmp11],  %[ftmp12]                    \n\t"
227        "punpcklhw  %[ftmp11], %[ftmp13],  %[ftmp14]                    \n\t"
228        "punpckhhw  %[ftmp12], %[ftmp13],  %[ftmp14]                    \n\t"
229
230        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
231        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
232                               0x000f0010, 0x00040009, %[ff_pw_4])
233
234        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
235        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
236                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])
237
238        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
239        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
240                               0xfff00009, 0x000f0004, %[ff_pw_4])
241
242        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
243        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
244                               0xfff70004, 0xfff0000f, %[ff_pw_4])
245
246        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
247                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
248
249        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
250                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
251
252        MMI_SDC1(%[ftmp15], %[temp], 0x00)
253        MMI_SDC1(%[ftmp19], %[temp], 0x08)
254        MMI_SDC1(%[ftmp16], %[temp], 0x10)
255        MMI_SDC1(%[ftmp20], %[temp], 0x18)
256        MMI_SDC1(%[ftmp17], %[temp], 0x20)
257        MMI_SDC1(%[ftmp21], %[temp], 0x28)
258        MMI_SDC1(%[ftmp18], %[temp], 0x30)
259        MMI_SDC1(%[ftmp22], %[temp], 0x38)
260
261       // 2nd part
262        MMI_LDC1(%[ftmp1], %[block], 0x08)
263        MMI_LDC1(%[ftmp11], %[block], 0x18)
264        MMI_LDC1(%[ftmp2], %[block], 0x28)
265        MMI_LDC1(%[ftmp12], %[block], 0x38)
266        MMI_LDC1(%[ftmp3], %[block], 0x48)
267        MMI_LDC1(%[ftmp13], %[block], 0x58)
268        MMI_LDC1(%[ftmp4], %[block], 0x68)
269        MMI_LDC1(%[ftmp14], %[block], 0x78)
270        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
271        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
272        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
273        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
274
275        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
276        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
277        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
278        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"
279
280        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
281        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
282                               0x000f0010, 0x00040009, %[ff_pw_4])
283
284        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
285        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
286                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])
287
288        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
289        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
290                               0xfff00009, 0x000f0004, %[ff_pw_4])
291
292        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
293        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
294                               0xfff70004, 0xfff0000f, %[ff_pw_4])
295
296        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
297                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
298
299        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
300                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
301
302        MMI_SDC1(%[ftmp19], %[temp], 0x48)
303        MMI_SDC1(%[ftmp20], %[temp], 0x58)
304        MMI_SDC1(%[ftmp21], %[temp], 0x68)
305        MMI_SDC1(%[ftmp22], %[temp], 0x78)
306        /* 1st loop: end */
307
308        /* 2nd loop: start */
309        "li         %[tmp0],    0x07                                    \n\t"
310        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
311
312        // 1st part
313        MMI_LDC1(%[ftmp1], %[temp], 0x00)
314        MMI_LDC1(%[ftmp11], %[temp], 0x10)
315        MMI_LDC1(%[ftmp2], %[temp], 0x20)
316        MMI_LDC1(%[ftmp12], %[temp], 0x30)
317        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
318        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
319        "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]                   \n\t"
320        "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]                   \n\t"
321
322        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
323        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
324        "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]                   \n\t"
325        "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]                   \n\t"
326
327        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
328        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
329                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
330
331        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
332        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
333                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
334
335        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
336        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
337                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
338
339        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
340        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
341                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
342
343        MMI_SDC1(%[ftmp15], %[block], 0x00)
344        MMI_SDC1(%[ftmp16], %[block], 0x10)
345        MMI_SDC1(%[ftmp17], %[block], 0x20)
346        MMI_SDC1(%[ftmp18], %[block], 0x30)
347        MMI_SDC1(%[ftmp19], %[block], 0x40)
348        MMI_SDC1(%[ftmp20], %[block], 0x50)
349        MMI_SDC1(%[ftmp21], %[block], 0x60)
350        MMI_SDC1(%[ftmp22], %[block], 0x70)
351
352       // 2nd part
353        MMI_LDC1(%[ftmp1], %[temp], 0x08)
354        MMI_LDC1(%[ftmp11], %[temp], 0x18)
355        MMI_LDC1(%[ftmp2], %[temp], 0x28)
356        MMI_LDC1(%[ftmp12], %[temp], 0x38)
357        MMI_LDC1(%[ftmp3], %[temp], 0x48)
358        MMI_LDC1(%[ftmp13], %[temp], 0x58)
359        MMI_LDC1(%[ftmp4], %[temp], 0x68)
360        MMI_LDC1(%[ftmp14], %[temp], 0x78)
361        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
362        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
363        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
364        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
365
366        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
367        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
368        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
369        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"
370
371        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
372        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
373                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
374
375        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
376        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
377                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
378
379        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
380        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
381                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
382
383        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
384        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
385                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
386
387        MMI_SDC1(%[ftmp15], %[block], 0x08)
388        MMI_SDC1(%[ftmp16], %[block], 0x18)
389        MMI_SDC1(%[ftmp17], %[block], 0x28)
390        MMI_SDC1(%[ftmp18], %[block], 0x38)
391        MMI_SDC1(%[ftmp19], %[block], 0x48)
392        MMI_SDC1(%[ftmp20], %[block], 0x58)
393        MMI_SDC1(%[ftmp21], %[block], 0x68)
394        MMI_SDC1(%[ftmp22], %[block], 0x78)
395        /* 2nd loop: end */
396        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
397          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
398          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
399          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
400          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
401          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
402          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
403          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
404          [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
405          [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
406          [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
407          [ftmp22]"=&f"(ftmp[22]),
408          [tmp0]"=&r"(tmp[0])
409        : [ff_pw_1]"f"(ff_pw_32_1.f),   [ff_pw_64]"f"(ff_pw_32_64.f),
410          [ff_pw_4]"f"(ff_pw_32_4.f),   [block]"r"(block),
411          [temp]"r"(temp)
412        : "memory"
413    );
414}
415#endif
416
417/* Do inverse transform on 8x4 part of block */
418void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
419{
420    int dc = block[0];
421    double ftmp[9];
422    union mmi_intfloat64 dc_u;
423
424    dc = ( 3 * dc +  1) >> 1;
425    dc = (17 * dc + 64) >> 7;
426    dc_u.i = dc;
427
428    __asm__ volatile(
429        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
430        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
431
432        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
433        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
434        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
435        MMI_LDC1(%[ftmp4], %[dest3], 0x00)
436
437        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
438        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
439        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
440        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
441        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
442        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
443        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
444        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
445
446        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
447        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
448        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
449        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
450        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
451        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
452        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
453        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
454
455        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
456        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
457        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
458        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
459
460        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
461        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
462        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
463        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
464        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
465          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
466          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
467          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
468          [ftmp8]"=&f"(ftmp[8])
469        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
470          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
471          [dc]"f"(dc_u.f)
472        : "memory"
473    );
474}
475
476#if _MIPS_SIM != _ABIO32
477void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
478{
479    int16_t *src = block;
480    int16_t *dst = block;
481    double ftmp[16];
482    uint32_t tmp[1];
483    int16_t count = 4;
484    int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
485                         12, 15,   6,  -4, -12, -16, -16,  -9,
486                         12,  9,  -6, -16, -12,   4,  16,  15,
487                         12,  4, -16,  -9,  12,  15,  -6, -16,
488                         12, -4, -16,   9,  12, -15,  -6,  16,
489                         12, -9,  -6,  16, -12,  -4,  16, -15,
490                         12, -15,  6,   4, -12,  16, -16,   9,
491                         12, -16, 16, -15,  12,  -9,   6,  -4};
492
493    // 1st loop
494    __asm__ volatile (
495        "li         %[tmp0],    0x03                                    \n\t"
496        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
497
498        "1:                                                             \n\t"
499        MMI_LDC1(%[ftmp1], %[src], 0x00)
500        MMI_LDC1(%[ftmp2], %[src], 0x08)
501
502        /* ftmp11: dst1,dst0 */
503        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
504        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
505        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
506        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
507        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
508        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
509        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
510        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
511        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
512        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
513        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
514        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
515        "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
516        "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"
517
518        /* ftmp12: dst3,dst2 */
519        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
520        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
521        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
522        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
523        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
524        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
525        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
526        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
527        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
528        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
529        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
530        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
531        "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
532        "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"
533
534        /* ftmp13: dst5,dst4 */
535        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
536        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
537        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
538        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
539        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
540        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
541        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
542        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
543        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
544        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
545        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
546        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
547        "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
548        "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"
549
550        /* ftmp14: dst7,dst6 */
551        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
552        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
553        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
554        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
555        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
556        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
557        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
558        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
559        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
560        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
561        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
562        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
563        "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
564        "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"
565
566        /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
567        "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
568        "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
569        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
570        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
571        "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
572        "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
573        "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
574        "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
575        "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
576        "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
577        MMI_SDC1(%[ftmp9], %[dst], 0x00)
578        MMI_SDC1(%[ftmp10], %[dst], 0x08)
579
580        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
581        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
582        "addiu      %[count],   %[count],   -0x01                       \n\t"
583        "bnez       %[count],   1b                                      \n\t"
584        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
585          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
586          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
587          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
588          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
589          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
590          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
591          [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
592          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
593        : [ff_pw_4]"f"(ff_pw_32_4.f),   [coeff]"r"(coeff)
594        : "memory"
595    );
596
597    src = block;
598
599    // 2nd loop
600    __asm__ volatile (
601        "li         %[tmp0],    0x44                                    \n\t"
602        "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
603
604        // 1st part
605        "li         %[tmp0],    0x07                                    \n\t"
606        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
607        MMI_LDC1(%[ftmp1], %[src], 0x00)
608        MMI_LDC1(%[ftmp2], %[src], 0x10)
609        MMI_LDC1(%[ftmp3], %[src], 0x20)
610        MMI_LDC1(%[ftmp4], %[src], 0x30)
611        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
612        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
613        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
614        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
615
616        /* ftmp11: dst03,dst02,dst01,dst00 */
617        "li         %[tmp0],    0x00160011                              \n\t"
618        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
619        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
620        "li         %[tmp0],    0x000a0011                              \n\t"
621        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
622        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
623        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
624        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
625        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
626        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
627        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
628        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
629        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
630        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
631        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
632        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
633        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
634        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
635        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
636
637        /* ftmp12: dst13,dst12,dst11,dst10 */
638        "li         %[tmp0],    0x000a0011                              \n\t"
639        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
640        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
641        "li         %[tmp0],    0xffeaffef                              \n\t"
642        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
643        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
644        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
645        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
646        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
647        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
648        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
649        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
650        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
651        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
652        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
653        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
654        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
655        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
656        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
657
658        /* ftmp13: dst23,dst22,dst21,dst20 */
659        "li         %[tmp0],    0xfff60011                              \n\t"
660        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
661        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
662        "li         %[tmp0],    0x0016ffef                              \n\t"
663        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
664        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
665        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
666        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
667        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
668        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
669        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
670        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
671        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
672        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
673        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
674        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
675        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
676        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
677        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
678
679        /* ftmp14: dst33,dst32,dst31,dst30 */
680        "li         %[tmp0],    0xffea0011                              \n\t"
681        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
682        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
683        "li         %[tmp0],    0xfff60011                              \n\t"
684        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
685        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
686        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
687        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
688        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
689        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
690        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
691        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
692        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
693        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
694        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
695        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
696        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
697        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
698        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
699
700        MMI_LWC1(%[ftmp1], %[dest], 0x00)
701        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
702        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
703        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
704        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
705        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
706        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
707        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
708        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
709        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
710        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
711        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
712        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
713        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
714        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
715        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
716        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
717        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
718        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
719        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
720        MMI_SWC1(%[ftmp1], %[dest], 0x00)
721        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
722        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
723        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
724        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
725        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
726        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
727
728        // 2nd part
729        "li         %[tmp0],    0x07                                    \n\t"
730        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
731        MMI_LDC1(%[ftmp1], %[src], 0x08)
732        MMI_LDC1(%[ftmp2], %[src], 0x18)
733        MMI_LDC1(%[ftmp3], %[src], 0x28)
734        MMI_LDC1(%[ftmp4], %[src], 0x38)
735        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
736        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
737        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
738        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
739
740        /* ftmp11: dst03,dst02,dst01,dst00 */
741        "li         %[tmp0],    0x00160011                              \n\t"
742        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
743        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
744        "li         %[tmp0],    0x000a0011                              \n\t"
745        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
746        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
747        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
748        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
749        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
750        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
751        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
752        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
753        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
754        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
755        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
756        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
757        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
758        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
759        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
760
761        /* ftmp12: dst13,dst12,dst11,dst10 */
762        "li         %[tmp0],    0x000a0011                              \n\t"
763        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
764        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
765        "li         %[tmp0],    0xffeaffef                              \n\t"
766        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
767        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
768        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
769        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
770        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
771        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
772        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
773        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
774        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
775        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
776        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
777        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
778        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
779        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
780        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
781
782        /* ftmp13: dst23,dst22,dst21,dst20 */
783        "li         %[tmp0],    0xfff60011                              \n\t"
784        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
785        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
786        "li         %[tmp0],    0x0016ffef                              \n\t"
787        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
788        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
789        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
790        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
791        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
792        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
793        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
794        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
795        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
796        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
797        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
798        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
799        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
800        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
801        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
802
803        /* ftmp14: dst33,dst32,dst31,dst30 */
804        "li         %[tmp0],    0xffea0011                              \n\t"
805        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
806        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
807        "li         %[tmp0],    0xfff60011                              \n\t"
808        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
809        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
810        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
811        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
812        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
813        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
814        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
815        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
816        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
817        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
818        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
819        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
820        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
821        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
822        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
823
824        MMI_LWC1(%[ftmp1], %[dest], 0x04)
825        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
826        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
827        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
828        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
829        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
830        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
831        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
832        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
833        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
834        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
835        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
836        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
837        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
838        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
839        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
840        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
841        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
842        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
843        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
844        MMI_SWC1(%[ftmp1], %[dest], 0x04)
845        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
846        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
847        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
848        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
849        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
850        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
851
852        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
853          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
854          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
855          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
856          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
857          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
858          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
859          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
860          [tmp0]"=&r"(tmp[0])
861        : [ff_pw_64]"f"(ff_pw_32_64.f),
862          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
863        :"memory"
864    );
865}
866#endif
867
868/* Do inverse transform on 4x8 parts of block */
869void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
870{
871    int dc = block[0];
872    double ftmp[9];
873    union mmi_intfloat64 dc_u;
874    DECLARE_VAR_LOW32;
875
876    dc = (17 * dc +  4) >> 3;
877    dc = (12 * dc + 64) >> 7;
878    dc_u.i = dc;
879
880    __asm__ volatile(
881        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
882        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
883
884        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
885        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
886        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
887        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
888        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
889        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
890        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
891        MMI_LWC1(%[ftmp8], %[dest7], 0x00)
892
893        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
894        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
895        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
896        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
897        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
898        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
899        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
900        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
901
902        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
903        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
904        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
905        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
906        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
907        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
908        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
909        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
910
911        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
912        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
913        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
914        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
915        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
916        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
917        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
918        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
919
920        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
921        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
922        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
923        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
924        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
925        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
926        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
927        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
928        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
929          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
930          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
931          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
932          RESTRICT_ASM_LOW32
933          [ftmp8]"=&f"(ftmp[8])
934        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
935          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
936          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
937          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
938          [dc]"f"(dc_u.f)
939        : "memory"
940    );
941}
942
943#if _MIPS_SIM != _ABIO32
944void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
945{
946    int16_t *src = block;
947    int16_t *dst = block;
948    double ftmp[23];
949    uint64_t count = 8, tmp[1];
950    int16_t coeff[16] = {17, 22, 17, 10,
951                         17, 10,-17,-22,
952                         17,-10,-17, 22,
953                         17,-22, 17,-10};
954
955    // 1st loop
956    __asm__ volatile (
957
958        "li         %[tmp0],    0x03                                    \n\t"
959        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
960
961        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
962        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
963        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
964        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
965        "1:                                                             \n\t"
966        /* ftmp8: dst3,dst2,dst1,dst0 */
967        MMI_LDC1(%[ftmp1], %[src], 0x00)
968        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
969        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
970        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
971        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
972        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
973        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
974        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
975        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
976        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
977        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
978        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
979        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
980        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
981        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
982        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
983        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
984        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
985        MMI_SDC1(%[ftmp8], %[dst], 0x00)
986
987        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
988        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
989        "addiu      %[count],   %[count],   -0x01                       \n\t"
990        "bnez       %[count],   1b                                      \n\t"
991        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
992          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
993          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
994          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
995          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
996          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
997          [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
998          [src]"+&r"(src),              [dst]"+&r"(dst)
999        : [ff_pw_4]"f"(ff_pw_32_4.f),   [coeff]"r"(coeff)
1000        : "memory"
1001    );
1002
1003    src = block;
1004
1005    // 2nd loop
1006    __asm__ volatile (
1007        "li         %[tmp0],    0x07                                    \n\t"
1008        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1009
1010        MMI_LDC1(%[ftmp1], %[src], 0x00)
1011        MMI_LDC1(%[ftmp2], %[src], 0x20)
1012        MMI_LDC1(%[ftmp3], %[src], 0x40)
1013        MMI_LDC1(%[ftmp4], %[src], 0x60)
1014        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1015        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1016        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1017        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1018
1019        MMI_LDC1(%[ftmp1], %[src], 0x10)
1020        MMI_LDC1(%[ftmp2], %[src], 0x30)
1021        MMI_LDC1(%[ftmp3], %[src], 0x50)
1022        MMI_LDC1(%[ftmp4], %[src], 0x70)
1023        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1024        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1025        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
1026        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
1027
1028        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
1029        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
1030                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
1031
1032        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
1033        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
1034                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
1035
1036        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
1037        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
1038                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
1039
1040        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
1041        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
1042                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
1043
1044        MMI_LWC1(%[ftmp1], %[dest], 0x00)
1045        PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1046        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1047        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1048        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1049        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1050        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1051        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1052        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1053        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1054        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1055        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1056        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1057        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1058        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1059        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1060        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1061        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1062        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1063        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1064        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1065        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1066        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1067        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1068
1069        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
1070        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
1071        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
1072        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
1073        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
1074        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
1075        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
1076        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
1077
1078        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1079        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1080        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1081        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1082        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1083        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1084        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1085        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1086
1087        MMI_SWC1(%[ftmp1], %[dest], 0x00)
1088        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1089        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1090        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1091        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1092        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1093        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1094        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1095        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1096        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1097        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1098        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1099        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1100        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1101        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1102
1103        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1104          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1105          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1106          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1107          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1108          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1109          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1110          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1111          [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
1112          [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
1113          [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
1114          [ftmp22]"=&f"(ftmp[22]),
1115          [tmp0]"=&r"(tmp[0])
1116        : [ff_pw_1]"f"(ff_pw_32_1.f),   [ff_pw_64]"f"(ff_pw_32_64.f),
1117          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1118        : "memory"
1119    );
1120}
1121#endif
1122
1123/* Do inverse transform on 4x4 part of block */
1124void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1125{
1126    int dc = block[0];
1127    double ftmp[5];
1128    union mmi_intfloat64 dc_u;
1129    DECLARE_VAR_LOW32;
1130
1131    dc = (17 * dc +  4) >> 3;
1132    dc = (17 * dc + 64) >> 7;
1133    dc_u.i = dc;
1134
1135    __asm__ volatile(
1136        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1137        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
1138
1139        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1140        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1141        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1142        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1143
1144        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1145        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1146        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1147        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1148
1149        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
1150        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
1151        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
1152        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
1153
1154        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1155        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1156        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1157        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1158
1159        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1160        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1161        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1162        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1163        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1164          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1165          RESTRICT_ASM_LOW32
1166          [ftmp4]"=&f"(ftmp[4])
1167        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
1168          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
1169          [dc]"f"(dc_u.f)
1170        : "memory"
1171    );
1172}
1173
1174void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1175{
1176    int16_t *src = block;
1177    int16_t *dst = block;
1178    double ftmp[16];
1179    uint32_t count = 4, tmp[1];
1180    int16_t coeff[16] = {17, 22, 17, 10,
1181                         17, 10,-17,-22,
1182                         17,-10,-17, 22,
1183                         17,-22, 17,-10};
1184    // 1st loop
1185    __asm__ volatile (
1186
1187        "li         %[tmp0],    0x03                                    \n\t"
1188        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1189        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1190        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1191        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1192        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1193        "1:                                                             \n\t"
1194        /* ftmp8: dst3,dst2,dst1,dst0 */
1195        MMI_LDC1(%[ftmp1], %[src], 0x00)
1196        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1197        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1198        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1199        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1200        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1201        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1202        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1203        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1204        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1205        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1206        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1207        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1208        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1209        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1210        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1211        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1212        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1213        MMI_SDC1(%[ftmp8], %[dst], 0x00)
1214
1215        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1216        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1217        "addiu      %[count],   %[count],   -0x01                       \n\t"
1218        "bnez       %[count],   1b                                      \n\t"
1219        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1220          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1221          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1222          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1223          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1224          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1225          [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1226          [src]"+&r"(src),              [dst]"+&r"(dst)
1227        : [ff_pw_4]"f"(ff_pw_32_4.f),   [coeff]"r"(coeff)
1228        : "memory"
1229    );
1230
1231    src = block;
1232
1233    // 2nd loop
1234    __asm__ volatile (
1235        "li         %[tmp0],    0x07                                    \n\t"
1236        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1237        "li         %[tmp0],    0x44                                    \n\t"
1238        "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
1239
1240        MMI_LDC1(%[ftmp1], %[src], 0x00)
1241        MMI_LDC1(%[ftmp2], %[src], 0x10)
1242        MMI_LDC1(%[ftmp3], %[src], 0x20)
1243        MMI_LDC1(%[ftmp4], %[src], 0x30)
1244        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1245        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1246        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1247        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1248
1249        /* ftmp11: dst03,dst02,dst01,dst00 */
1250        "li         %[tmp0],    0x00160011                              \n\t"
1251        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1252        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1253        "li         %[tmp0],    0x000a0011                              \n\t"
1254        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1255        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1256        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1257        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1258        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1259        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1260        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1261        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1262        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1263        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1264        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1265        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1266        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1267        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1268        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
1269
1270        /* ftmp12: dst13,dst12,dst11,dst10 */
1271        "li         %[tmp0],    0x000a0011                              \n\t"
1272        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1273        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1274        "li         %[tmp0],    0xffeaffef                              \n\t"
1275        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1276        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1277        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1278        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1279        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1280        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1281        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1282        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1283        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1284        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1285        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1286        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1287        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1288        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1289        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
1290
1291        /* ftmp13: dst23,dst22,dst21,dst20 */
1292        "li         %[tmp0],    0xfff60011                              \n\t"
1293        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1294        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1295        "li         %[tmp0],    0x0016ffef                              \n\t"
1296        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1297        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1298        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1299        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1300        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1301        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1302        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1303        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1304        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1305        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1306        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1307        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1308        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1309        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1310        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
1311
1312        /* ftmp14: dst33,dst32,dst31,dst30 */
1313        "li         %[tmp0],    0xffea0011                              \n\t"
1314        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1315        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1316        "li         %[tmp0],    0xfff60011                              \n\t"
1317        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1318        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1319        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1320        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1321        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1322        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1323        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1324        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1325        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1326        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1327        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1328        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1329        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1330        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1331        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
1332
1333        MMI_LWC1(%[ftmp1], %[dest], 0x00)
1334        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1335        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1336        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1337        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1338        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1339        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1340        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1341        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1342        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1343        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1344        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1345        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
1346        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
1347        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
1348        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
1349        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1350        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1351        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1352        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1353
1354        MMI_SWC1(%[ftmp1], %[dest], 0x00)
1355        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1356        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1357        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1358        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1359        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1360        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1361
1362        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1363          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1364          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1365          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1366          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1367          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1368          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1369          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1370          [tmp0]"=&r"(tmp[0])
1371        : [ff_pw_64]"f"(ff_pw_32_64.f),
1372          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1373        :"memory"
1374    );
1375}
1376
1377/* Apply overlap transform to horizontal edge */
1378void ff_vc1_h_overlap_mmi(uint8_t *src, ptrdiff_t stride)
1379{
1380    int i;
1381    int a, b, c, d;
1382    int d1, d2;
1383    int rnd = 1;
1384    for (i = 0; i < 8; i++) {
1385        a  = src[-2];
1386        b  = src[-1];
1387        c  = src[0];
1388        d  = src[1];
1389        d1 = (a - d + 3 + rnd) >> 3;
1390        d2 = (a - d + b - c + 4 - rnd) >> 3;
1391
1392        src[-2] = a - d1;
1393        src[-1] = av_clip_uint8(b - d2);
1394        src[0]  = av_clip_uint8(c + d2);
1395        src[1]  = d + d1;
1396        src    += stride;
1397        rnd     = !rnd;
1398    }
1399}
1400
1401void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags)
1402{
1403    int i;
1404    int a, b, c, d;
1405    int d1, d2;
1406    int rnd1 = flags & 2 ? 3 : 4;
1407    int rnd2 = 7 - rnd1;
1408    for (i = 0; i < 8; i++) {
1409        a  = left[6];
1410        b  = left[7];
1411        c  = right[0];
1412        d  = right[1];
1413        d1 = a - d;
1414        d2 = a - d + b - c;
1415
1416        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
1417        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
1418        right[0] = ((c << 3) + d2 + rnd1) >> 3;
1419        right[1] = ((d << 3) + d1 + rnd2) >> 3;
1420
1421        right += right_stride;
1422        left  += left_stride;
1423        if (flags & 1) {
1424            rnd2   = 7 - rnd2;
1425            rnd1   = 7 - rnd1;
1426        }
1427    }
1428}
1429
1430/* Apply overlap transform to vertical edge */
1431void ff_vc1_v_overlap_mmi(uint8_t *src, ptrdiff_t stride)
1432{
1433    int i;
1434    int a, b, c, d;
1435    int d1, d2;
1436    int rnd = 1;
1437    for (i = 0; i < 8; i++) {
1438        a  = src[-2 * stride];
1439        b  = src[-stride];
1440        c  = src[0];
1441        d  = src[stride];
1442        d1 = (a - d + 3 + rnd) >> 3;
1443        d2 = (a - d + b - c + 4 - rnd) >> 3;
1444
1445        src[-2 * stride] = a - d1;
1446        src[-stride]     = av_clip_uint8(b - d2);
1447        src[0]           = av_clip_uint8(c + d2);
1448        src[stride]      = d + d1;
1449        src++;
1450        rnd = !rnd;
1451    }
1452}
1453
1454void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1455{
1456    int i;
1457    int a, b, c, d;
1458    int d1, d2;
1459    int rnd1 = 4, rnd2 = 3;
1460    for (i = 0; i < 8; i++) {
1461        a  = top[48];
1462        b  = top[56];
1463        c  = bottom[0];
1464        d  = bottom[8];
1465        d1 = a - d;
1466        d2 = a - d + b - c;
1467
1468        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
1469        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
1470        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1471        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1472
1473        bottom++;
1474        top++;
1475        rnd2 = 7 - rnd2;
1476        rnd1 = 7 - rnd1;
1477    }
1478}
1479
1480/**
1481 * VC-1 in-loop deblocking filter for one line
1482 * @param src source block type
1483 * @param stride block stride
1484 * @param pq block quantizer
1485 * @return whether other 3 pairs should be filtered or not
1486 * @see 8.6
1487 */
1488static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1489{
1490    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1491              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1492    int a0_sign = a0 >> 31;        /* Store sign */
1493
1494    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1495    if (a0 < pq) {
1496        int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1497                        5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1498        int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1499                        5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1500        if (a1 < a0 || a2 < a0) {
1501            int clip      = src[-1 * stride] - src[0 * stride];
1502            int clip_sign = clip >> 31;
1503
1504            clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1505            if (clip) {
1506                int a3     = FFMIN(a1, a2);
1507                int d      = 5 * (a3 - a0);
1508                int d_sign = (d >> 31);
1509
1510                d       = ((d ^ d_sign) - d_sign) >> 3;
1511                d_sign ^= a0_sign;
1512
1513                if (d_sign ^ clip_sign)
1514                    d = 0;
1515                else {
1516                    d = FFMIN(d, clip);
1517                    d = (d ^ d_sign) - d_sign; /* Restore sign */
1518                    src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1519                    src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1520                }
1521                return 1;
1522            }
1523        }
1524    }
1525    return 0;
1526}
1527
1528/**
1529 * VC-1 in-loop deblocking filter
1530 * @param src source block type
1531 * @param step distance between horizontally adjacent elements
1532 * @param stride distance between vertically adjacent elements
1533 * @param len edge length to filter (4 or 8 pixels)
1534 * @param pq block quantizer
1535 * @see 8.6
1536 */
1537static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1538                                   int len, int pq)
1539{
1540    int i;
1541    int filt3;
1542
1543    for (i = 0; i < len; i += 4) {
1544        filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1545        if (filt3) {
1546            vc1_filter_line(src + 0 * step, stride, pq);
1547            vc1_filter_line(src + 1 * step, stride, pq);
1548            vc1_filter_line(src + 3 * step, stride, pq);
1549        }
1550        src += step * 4;
1551    }
1552}
1553
1554void ff_vc1_v_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1555{
1556    vc1_loop_filter(src, 1, stride, 4, pq);
1557}
1558
1559void ff_vc1_h_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1560{
1561    vc1_loop_filter(src, stride, 1, 4, pq);
1562}
1563
1564void ff_vc1_v_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1565{
1566    vc1_loop_filter(src, 1, stride, 8, pq);
1567}
1568
1569void ff_vc1_h_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1570{
1571    vc1_loop_filter(src, stride, 1, 8, pq);
1572}
1573
1574void ff_vc1_v_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1575{
1576    vc1_loop_filter(src, 1, stride, 16, pq);
1577}
1578
1579void ff_vc1_h_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1580{
1581    vc1_loop_filter(src, stride, 1, 16, pq);
1582}
1583
1584void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1585                               ptrdiff_t stride, int rnd)
1586{
1587    ff_put_pixels8_8_mmi(dst, src, stride, 8);
1588}
1589void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1590                                  ptrdiff_t stride, int rnd)
1591{
1592    ff_put_pixels16_8_mmi(dst, src, stride, 16);
1593}
1594void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1595                               ptrdiff_t stride, int rnd)
1596{
1597    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1598}
1599void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1600                                  ptrdiff_t stride, int rnd)
1601{
1602    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1603}
1604
1605#define OP_PUT(S, D)
1606#define OP_AVG(S, D)                                                        \
1607    "ldc1       $f16,   "#S"                        \n\t"                   \
1608    "pavgb      "#D",   "#D",   $f16                \n\t"
1609
1610/** Add rounder from $f14 to $f6 and pack result at destination */
1611#define NORMALIZE_MMI(SHIFT)                                                \
1612    "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1613    "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1614    "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1615    "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1616
1617#define TRANSFER_DO_PACK(OP)                                                \
1618    "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1619    OP((%[dst]), $f6)                                                       \
1620    "sdc1       $f6,    0x00(%[dst])                \n\t"
1621
1622#define TRANSFER_DONT_PACK(OP)                                              \
1623     OP(0(%[dst]), $f6)                                                     \
1624     OP(8(%[dst]), $f8)                                                     \
1625     "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1626     "sdc1      $f8,    0x08(%[dst])                \n\t"
1627
1628/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1629#define DO_UNPACK(reg)                                                      \
1630    "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1631#define DONT_UNPACK(reg)
1632
1633/** Compute the rounder 32-r or 8-r and unpacks it to $f14 */
1634#define LOAD_ROUNDER_MMI(ROUND)                                             \
1635    "lwc1       $f14,   "ROUND"                     \n\t"                   \
1636    "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1637    "punpcklwd  $f14,   $f14,   $f14                \n\t"
1638
1639
1640#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1641    "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1642    PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1643    MMI_ULWC1(R0, $9, 0x00)                                                 \
1644    "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
1645    "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1646    PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1647    MMI_ULWC1(R3, $9, 0x00)                                                 \
1648    "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1649    "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1650    "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1651    "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1652    "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1653    MMI_SDC1(R1, %[dst], OFF)                                               \
1654    PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1655
1656/** Sacrificing $f12 makes it possible to pipeline loads from src */
1657static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1658                                       const uint8_t *src, mips_reg stride,
1659                                       int rnd, int64_t shift)
1660{
1661    union mmi_intfloat64 shift_u;
1662    DECLARE_VAR_LOW32;
1663    DECLARE_VAR_ADDRT;
1664    shift_u.i = shift;
1665
1666    __asm__ volatile(
1667        "pxor       $f0,    $f0,    $f0             \n\t"
1668        "li         $8,     0x03                    \n\t"
1669        LOAD_ROUNDER_MMI("%[rnd]")
1670        "1:                                         \n\t"
1671        MMI_ULWC1($f4, %[src], 0x00)
1672        PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1673        MMI_ULWC1($f6, %[src], 0x00)
1674        "punpcklbh  $f4,    $f4,    $f0             \n\t"
1675        "punpcklbh  $f6,    $f6,    $f0             \n\t"
1676        SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1677        SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1678        SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1679        SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1680        SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1681        SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1682        SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1683        SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1684        PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1685        PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1686        "addiu      $8,     $8,    -0x01            \n\t"
1687        "bnez       $8,     1b                      \n\t"
1688        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1689          [src]"+r"(src),               [dst]"+r"(dst)
1690        : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1691          [shift]"f"(shift_u.f),        [rnd]"m"(rnd),
1692          [stride2]"r"(9*stride-4)
1693        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
1694          "$f14", "$f16", "memory"
1695    );
1696}
1697
1698/**
1699 * Data is already unpacked, so some operations can directly be made from
1700 * memory.
1701 */
1702#define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1703static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1704                                             const int16_t *src, int rnd)   \
1705{                                                                           \
1706    int h = 8;                                                              \
1707    DECLARE_VAR_ALL64;                                                      \
1708    DECLARE_VAR_ADDRT;                                                      \
1709                                                                            \
1710    src -= 1;                                                               \
1711    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1712                                                                            \
1713    __asm__ volatile(                                                       \
1714        LOAD_ROUNDER_MMI("%[rnd]")                                          \
1715        "1:                                         \n\t"                   \
1716        MMI_ULDC1($f2, %[src], 0x00)                                        \
1717        MMI_ULDC1($f4, %[src], 0x08)                                        \
1718        MMI_ULDC1($f6, %[src], 0x02)                                        \
1719        MMI_ULDC1($f8, %[src], 0x0a)                                        \
1720        MMI_ULDC1($f0, %[src], 0x06)                                        \
1721        "paddh      $f2,    $f2,    $f0             \n\t"                   \
1722        MMI_ULDC1($f0, %[src], 0x0e)                                        \
1723        "paddh      $f4,    $f4,    $f0             \n\t"                   \
1724        MMI_ULDC1($f0, %[src], 0x04)                                        \
1725        "paddh      $f6,    $f6,    $f0             \n\t"                   \
1726        MMI_ULDC1($f0, %[src], 0x0b)                                        \
1727        "paddh      $f8,    $f8,    $f0             \n\t"                   \
1728        "pmullh     $f6,    $f6,    %[ff_pw_9]      \n\t"                   \
1729        "pmullh     $f8,    $f8,    %[ff_pw_9]      \n\t"                   \
1730        "psubh      $f6,    $f6,    $f2             \n\t"                   \
1731        "psubh      $f8,    $f8,    $f4             \n\t"                   \
1732        "li         $8,     0x07                    \n\t"                   \
1733        "mtc1       $8,     $f16                    \n\t"                   \
1734        NORMALIZE_MMI("$f16")                                               \
1735        /* Remove bias */                                                   \
1736        "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1737        "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1738        TRANSFER_DO_PACK(OP)                                                \
1739        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1740        PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1741        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1742        "bnez       %[h],   1b                      \n\t"                   \
1743        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1744          [h]"+r"(h),                                                       \
1745          [src]"+r"(src),               [dst]"+r"(dst)                      \
1746        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1747          [ff_pw_9]"f"(ff_pw_9.f),      [ff_pw_128]"f"(ff_pw_128.f)         \
1748        : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f14",                  \
1749          "$f16", "memory"                                                  \
1750    );                                                                      \
1751}
1752
1753VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755
1756/**
1757 * Purely vertical or horizontal 1/2 shift interpolation.
1758 * Sacrify $f12 for *9 factor.
1759 */
1760#define VC1_SHIFT2(OP, OPNAME)\
1761static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1762                                     mips_reg stride, int rnd,              \
1763                                     mips_reg offset)                       \
1764{                                                                           \
1765    DECLARE_VAR_LOW32;                                                      \
1766    DECLARE_VAR_ADDRT;                                                      \
1767                                                                            \
1768    rnd = 8 - rnd;                                                          \
1769                                                                            \
1770    __asm__ volatile(                                                       \
1771        "pxor       $f0,    $f0,    $f0             \n\t"                   \
1772        "li         $10,    0x08                    \n\t"                   \
1773        LOAD_ROUNDER_MMI("%[rnd]")                                          \
1774        "1:                                         \n\t"                   \
1775        MMI_ULWC1($f6, %[src], 0x00)                                        \
1776        MMI_ULWC1($f8, %[src], 0x04)                                        \
1777        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1778        MMI_ULWC1($f2, $9, 0x00)                                            \
1779        MMI_ULWC1($f4, $9, 0x04)                                            \
1780        PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1781        "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1782        "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1783        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1784        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1785        "paddh      $f6,    $f6,    $f2             \n\t"                   \
1786        "paddh      $f8,    $f8,    $f4             \n\t"                   \
1787        PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1788        MMI_ULWC1($f2, $9, 0x00)                                            \
1789        MMI_ULWC1($f4, $9, 0x04)                                            \
1790        "pmullh     $f6,    $f6,    %[ff_pw_9]      \n\t" /* 0,9,9,0*/      \
1791        "pmullh     $f8,    $f8,    %[ff_pw_9]      \n\t" /* 0,9,9,0*/      \
1792        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1793        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1794        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1795        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1796        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1797        MMI_ULWC1($f2, $9, 0x00)                                            \
1798        MMI_ULWC1($f4, $9, 0x04)                                            \
1799        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1800        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1801        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1802        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1803        "li         $8,     0x04                    \n\t"                   \
1804        "mtc1       $8,     $f16                    \n\t"                   \
1805        NORMALIZE_MMI("$f16")                                               \
1806        "packushb   $f6,    $f6,    $f8             \n\t"                   \
1807        OP((%[dst]), $f6)                                                   \
1808        "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1809        "addiu      $10,    $10,   -0x01            \n\t"                   \
1810        PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1811        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1812        "bnez       $10,    1b                      \n\t"                   \
1813        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1814          [src]"+r"(src),               [dst]"+r"(dst)                      \
1815        : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1816          [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1817          [stride1]"r"(stride-offset),                                      \
1818          [ff_pw_9]"f"(ff_pw_9.f)                                           \
1819        : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1820          "$f14", "$f16", "memory"                                          \
1821    );                                                                      \
1822}
1823
1824VC1_SHIFT2(OP_PUT, put_)
1825VC1_SHIFT2(OP_AVG, avg_)
1826
1827/**
1828 * Core of the 1/4 and 3/4 shift bicubic interpolation.
1829 *
1830 * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
1831 * @param LOAD    "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked.
1832 * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1833 * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1834 * @param A2      Stride address of 2nd tap
1835 * @param A3      Stride address of 3rd tap
1836 * @param A4      Stride address of 4th tap
1837 */
1838#define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1839    PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1840    LOAD($f2, $9, M*0)                                                      \
1841    LOAD($f4, $9, M*4)                                                      \
1842    UNPACK("$f2")                                                           \
1843    UNPACK("$f4")                                                           \
1844    "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1845    "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1846    PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1847    LOAD($f6, $9, M*0)                                                      \
1848    LOAD($f8, $9, M*4)                                                      \
1849    UNPACK("$f6")                                                           \
1850    UNPACK("$f8")                                                           \
1851    "pmullh     $f6,    $f6,    %[ff_pw_18]     \n\t" /* *18 */             \
1852    "pmullh     $f8,    $f8,    %[ff_pw_18]     \n\t" /* *18 */             \
1853    "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1854    "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1855    PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1856    LOAD($f2, $9, M*0)                                                      \
1857    LOAD($f4, $9, M*4)                                                      \
1858    UNPACK("$f2")                                                           \
1859    UNPACK("$f4")                                                           \
1860    "li         $8,     0x02                    \n\t"                       \
1861    "mtc1       $8,     $f16                    \n\t"                       \
1862    "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1863    "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1864    "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1865    "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1866    PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1867    LOAD($f2, $9, M*0)                                                      \
1868    LOAD($f4, $9, M*4)                                                      \
1869    UNPACK("$f2")                                                           \
1870    UNPACK("$f4")                                                           \
1871    "pmullh     $f2,    $f2,    %[ff_pw_53]     \n\t" /* *53 */             \
1872    "pmullh     $f4,    $f4,    %[ff_pw_53]     \n\t" /* *53 */             \
1873    "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
1874    "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
1875
1876/**
1877 * Macro to build the vertical 16bits version of vc1_put_shift[13].
1878 * Here, offset=src_stride. Parameters passed A1 to A4 must use
1879 * %3 (src_stride), %4 (2*src_stride) and %5 (3*src_stride).
1880 *
1881 * @param  NAME   Either 1 or 3
1882 * @see MSPEL_FILTER13_CORE for information on A1->A4
1883 */
1884#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1885static void                                                                 \
1886vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1887                                 mips_reg src_stride,                       \
1888                                 int rnd, int64_t shift)                    \
1889{                                                                           \
1890    int h = 8;                                                              \
1891    union mmi_intfloat64 shift_u;                                           \
1892    DECLARE_VAR_LOW32;                                                      \
1893    DECLARE_VAR_ADDRT;                                                      \
1894    shift_u.i = shift;                                                      \
1895                                                                            \
1896    src -= src_stride;                                                      \
1897                                                                            \
1898    __asm__ volatile(                                                       \
1899        "pxor       $f0,    $f0,    $f0             \n\t"                   \
1900        LOAD_ROUNDER_MMI("%[rnd]")                                          \
1901        ".p2align 3                                 \n\t"                   \
1902        "1:                                         \n\t"                   \
1903        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1904        NORMALIZE_MMI("%[shift]")                                           \
1905        TRANSFER_DONT_PACK(OP_PUT)                                          \
1906        /* Last 3 (in fact 4) bytes on the line */                          \
1907        PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1908        MMI_ULWC1($f2, $9, 0x08)                                            \
1909        DO_UNPACK("$f2")                                                    \
1910        "mov.d      $f6,    $f2                     \n\t"                   \
1911        "paddh      $f2,    $f2,    $f2             \n\t"                   \
1912        "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1913        PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1914        MMI_ULWC1($f6, $9, 0x08)                                            \
1915        DO_UNPACK("$f6")                                                    \
1916        "pmullh     $f6,    $f6,    %[ff_pw_18]     \n\t" /* *18 */         \
1917        "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1918        PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1919        MMI_ULWC1($f2, $9, 0x08)                                            \
1920        DO_UNPACK("$f2")                                                    \
1921        "pmullh     $f2,    $f2,    %[ff_pw_53]     \n\t" /* *53 */         \
1922        "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1923        PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1924        MMI_ULWC1($f2, $9, 0x08)                                            \
1925        DO_UNPACK("$f2")                                                    \
1926        "li         $8,     0x02                    \n\t"                   \
1927        "mtc1       $8,     $f16                    \n\t"                   \
1928        "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1929        "psubh      $f6,    $f6,    $f2             \n\t"                   \
1930        "paddh      $f6,    $f6,    $f14            \n\t"                   \
1931        "li         $8,     0x06                    \n\t"                   \
1932        "mtc1       $8,     $f16                    \n\t"                   \
1933        "psrah      $f6,    $f6,    $f16            \n\t"                   \
1934        "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1935        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1936        PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1937        PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1938        "bnez       %[h],   1b                      \n\t"                   \
1939        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1940          [h]"+r"(h),                                                       \
1941          [src]"+r"(src),               [dst]"+r"(dst)                      \
1942        : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1943          [stride_x3]"r"(3*src_stride),                                     \
1944          [rnd]"m"(rnd),                [shift]"f"(shift_u.f),              \
1945          [ff_pw_53]"f"(ff_pw_53.f),    [ff_pw_18]"f"(ff_pw_18.f),          \
1946          [ff_pw_3]"f"(ff_pw_3.f)                                           \
1947        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8",                    \
1948          "$f14", "$f16", "memory"                                          \
1949    );                                                                      \
1950}
1951
1952/**
1953 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1954 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
1955 *
1956 * @param  NAME   Either 1 or 3
1957 * @see MSPEL_FILTER13_CORE for information on A1->A4
1958 */
1959#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1960static void                                                                 \
1961OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1962                                       const int16_t *src, int rnd)         \
1963{                                                                           \
1964    int h = 8;                                                              \
1965    DECLARE_VAR_ALL64;                                                      \
1966    DECLARE_VAR_ADDRT;                                                      \
1967                                                                            \
1968    src -= 1;                                                               \
1969    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
1970                                                                            \
1971    __asm__ volatile(                                                       \
1972        "pxor       $f0,    $f0,    $f0             \n\t"                   \
1973        LOAD_ROUNDER_MMI("%[rnd]")                                          \
1974        ".p2align 3                                 \n\t"                   \
1975        "1:                                         \n\t"                   \
1976        MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1977        "li         $8,     0x07                    \n\t"                   \
1978        "mtc1       $8,     $f16                    \n\t"                   \
1979        NORMALIZE_MMI("$f16")                                               \
1980        /* Remove bias */                                                   \
1981        "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1982        "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1983        TRANSFER_DO_PACK(OP)                                                \
1984        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1985        PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
1986        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1987        "bnez       %[h],   1b                      \n\t"                   \
1988        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1989          [h]"+r"(h),                                                       \
1990          [src]"+r"(src),               [dst]"+r"(dst)                      \
1991        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1992          [ff_pw_53]"f"(ff_pw_53.f),    [ff_pw_18]"f"(ff_pw_18.f),          \
1993          [ff_pw_3]"f"(ff_pw_3.f),      [ff_pw_128]"f"(ff_pw_128.f)         \
1994        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8",                    \
1995          "$f14", "$f16", "memory"                                          \
1996    );                                                                      \
1997}
1998
1999/**
2000 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2001 * Here, offset=src_stride. Parameters passed A1 to A4 must use
2002 * %3 (offset), %4 (2*offset) and %5 (3*offset).
2003 *
2004 * @param  NAME   Either 1 or 3
2005 * @see MSPEL_FILTER13_CORE for information on A1->A4
2006 */
2007#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
2008static void                                                                 \
2009OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
2010                              mips_reg stride, int rnd, mips_reg offset)    \
2011{                                                                           \
2012    int h = 8;                                                              \
2013    DECLARE_VAR_LOW32;                                                      \
2014    DECLARE_VAR_ADDRT;                                                      \
2015                                                                            \
2016    src -= offset;                                                          \
2017    rnd = 32-rnd;                                                           \
2018                                                                            \
2019    __asm__ volatile (                                                      \
2020        "pxor       $f0,    $f0,    $f0             \n\t"                   \
2021        LOAD_ROUNDER_MMI("%[rnd]")                                          \
2022        ".p2align 3                                 \n\t"                   \
2023        "1:                                         \n\t"                   \
2024        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
2025        "li         $8,     0x06                    \n\t"                   \
2026        "mtc1       $8,     $f16                    \n\t"                   \
2027        NORMALIZE_MMI("$f16")                                               \
2028        TRANSFER_DO_PACK(OP)                                                \
2029        "addiu      %[h],   %[h],      -0x01        \n\t"                   \
2030        PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
2031        PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
2032        "bnez       %[h],   1b                      \n\t"                   \
2033        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
2034          [h]"+r"(h),                                                       \
2035          [src]"+r"(src),               [dst]"+r"(dst)                      \
2036        : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
2037          [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
2038          [rnd]"m"(rnd),                                                    \
2039          [ff_pw_53]"f"(ff_pw_53.f),    [ff_pw_18]"f"(ff_pw_18.f),          \
2040          [ff_pw_3]"f"(ff_pw_3.f)                                           \
2041        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8",                    \
2042          "$f14", "$f16", "memory"                                          \
2043    );                                                                      \
2044}
2045
2046
2047/** 1/4 shift bicubic interpolation */
2048MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2049MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2050MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2051MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2052MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2053
2054/** 3/4 shift bicubic interpolation */
2055MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2056MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2057MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2058MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2059MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2060
2061typedef void (*vc1_mspel_mc_filter_ver_16bits)
2062             (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2063              int64_t shift);
2064typedef void (*vc1_mspel_mc_filter_hor_16bits)
2065             (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2066typedef void (*vc1_mspel_mc_filter_8bits)
2067             (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2068              mips_reg offset);
2069
2070/**
2071 * Interpolate fractional pel values by applying proper vertical then
2072 * horizontal filter.
2073 *
2074 * @param  dst     Destination buffer for interpolated pels.
2075 * @param  src     Source buffer.
2076 * @param  stride  Stride for both src and dst buffers.
2077 * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
2078 * @param  hmode   Vertical filter.
2079 * @param  rnd     Rounding bias.
2080 */
2081#define VC1_MSPEL_MC(OP)                                                    \
2082static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2083                               int hmode, int vmode, int rnd)               \
2084{                                                                           \
2085    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2086         { NULL, vc1_put_ver_16b_shift1_mmi,                                \
2087                 vc1_put_ver_16b_shift2_mmi,                                \
2088                 vc1_put_ver_16b_shift3_mmi };                              \
2089    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2090         { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
2091                 OP ## vc1_hor_16b_shift2_mmi,                              \
2092                 OP ## vc1_hor_16b_shift3_mmi };                            \
2093    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
2094         { NULL, OP ## vc1_shift1_mmi,                                      \
2095                 OP ## vc1_shift2_mmi,                                      \
2096                 OP ## vc1_shift3_mmi };                                    \
2097                                                                            \
2098    if (vmode) { /* Vertical filter to apply */                             \
2099        if (hmode) { /* Horizontal filter to apply, output to tmp */        \
2100            static const int shift_value[] = { 0, 5, 1, 5 };                \
2101            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
2102            int    r;                                                       \
2103            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
2104                                                                            \
2105            r = (1<<(shift-1)) + rnd-1;                                     \
2106            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
2107                                                                            \
2108            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
2109            return;                                                         \
2110        }                                                                   \
2111        else { /* No horizontal filter, output 8 lines to dst */            \
2112            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
2113            return;                                                         \
2114        }                                                                   \
2115    }                                                                       \
2116                                                                            \
2117    /* Horizontal mode with no vertical mode */                             \
2118    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
2119}                                                                           \
2120static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
2121                                  int stride, int hmode, int vmode, int rnd)\
2122{                                                                           \
2123    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2124    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2125    dst += 8*stride; src += 8*stride;                                       \
2126    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2127    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2128}
2129
2130VC1_MSPEL_MC(put_)
2131VC1_MSPEL_MC(avg_)
2132
2133/** Macro to ease bicubic filter interpolation functions declarations */
2134#define DECLARE_FUNCTION(a, b)                                              \
2135void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2136                                           const uint8_t *src,              \
2137                                           ptrdiff_t stride,                \
2138                                           int rnd)                         \
2139{                                                                           \
2140     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2141}                                                                           \
2142void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2143                                           const uint8_t *src,              \
2144                                           ptrdiff_t stride,                \
2145                                           int rnd)                         \
2146{                                                                           \
2147     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2148}                                                                           \
2149void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2150                                              const uint8_t *src,           \
2151                                              ptrdiff_t stride,             \
2152                                              int rnd)                      \
2153{                                                                           \
2154     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2155}                                                                           \
2156void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2157                                              const uint8_t *src,           \
2158                                              ptrdiff_t stride,             \
2159                                              int rnd)                      \
2160{                                                                           \
2161     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2162}
2163
2164DECLARE_FUNCTION(0, 1)
2165DECLARE_FUNCTION(0, 2)
2166DECLARE_FUNCTION(0, 3)
2167
2168DECLARE_FUNCTION(1, 0)
2169DECLARE_FUNCTION(1, 1)
2170DECLARE_FUNCTION(1, 2)
2171DECLARE_FUNCTION(1, 3)
2172
2173DECLARE_FUNCTION(2, 0)
2174DECLARE_FUNCTION(2, 1)
2175DECLARE_FUNCTION(2, 2)
2176DECLARE_FUNCTION(2, 3)
2177
2178DECLARE_FUNCTION(3, 0)
2179DECLARE_FUNCTION(3, 1)
2180DECLARE_FUNCTION(3, 2)
2181DECLARE_FUNCTION(3, 3)
2182
2183#define CHROMA_MC_8_MMI                                                     \
2184        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
2185        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2186        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
2187        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2188        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
2189        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2190        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
2191        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2192                                                                            \
2193        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2194        "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
2195        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2196        "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
2197        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2198        "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
2199        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2200        "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
2201                                                                            \
2202        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2203        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2204        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2205        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2206                                                                            \
2207        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
2208        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
2209        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
2210        "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
2211                                                                            \
2212        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
2213        "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
2214        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
2215
2216
2217#define CHROMA_MC_4_MMI                                                     \
2218        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2219        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2220        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2221        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2222                                                                            \
2223        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2224        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2225        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2226        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2227                                                                            \
2228        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2229        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2230        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2231        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2232                                                                            \
2233        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
2234        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
2235
2236
2237void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2238                                      uint8_t *src /* align 1 */,
2239                                      ptrdiff_t stride, int h, int x, int y)
2240{
2241    union mmi_intfloat64 A, B, C, D;
2242    double ftmp[10];
2243    uint32_t tmp[1];
2244    DECLARE_VAR_ALL64;
2245    DECLARE_VAR_ADDRT;
2246    A.i = (8 - x) * (8 - y);
2247    B.i =     (x) * (8 - y);
2248    C.i = (8 - x) *     (y);
2249    D.i =     (x) *     (y);
2250
2251    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2252
2253    __asm__ volatile(
2254        "li         %[tmp0],    0x06                                    \n\t"
2255        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2256        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2257        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2258        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2259        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2260        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2261
2262        "1:                                                             \n\t"
2263        MMI_ULDC1(%[ftmp1], %[src], 0x00)
2264        MMI_ULDC1(%[ftmp2], %[src], 0x01)
2265        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2266        MMI_ULDC1(%[ftmp3], %[src], 0x00)
2267        MMI_ULDC1(%[ftmp4], %[src], 0x01)
2268
2269        CHROMA_MC_8_MMI
2270
2271        MMI_SDC1(%[ftmp1], %[dst], 0x00)
2272        "addiu      %[h],       %[h],      -0x01                        \n\t"
2273        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2274        "bnez       %[h],       1b                                      \n\t"
2275        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2276          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2277          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2278          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2279          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2280          RESTRICT_ASM_ALL64
2281          RESTRICT_ASM_ADDRT
2282          [tmp0]"=&r"(tmp[0]),
2283          [src]"+&r"(src),              [dst]"+&r"(dst),
2284          [h]"+&r"(h)
2285        : [stride]"r"((mips_reg)stride),
2286          [A]"f"(A.f),                  [B]"f"(B.f),
2287          [C]"f"(C.f),                  [D]"f"(D.f),
2288          [ff_pw_28]"f"(ff_pw_28.f)
2289        : "memory"
2290    );
2291}
2292
2293void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2294                                      uint8_t *src /* align 1 */,
2295                                      ptrdiff_t stride, int h, int x, int y)
2296{
2297    union mmi_intfloat64 A, B, C, D;
2298    double ftmp[6];
2299    uint32_t tmp[1];
2300    DECLARE_VAR_LOW32;
2301    DECLARE_VAR_ADDRT;
2302    A.i = (8 - x) * (8 - y);
2303    B.i =     (x) * (8 - y);
2304    C.i = (8 - x) *     (y);
2305    D.i =     (x) *     (y);
2306
2307    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2308
2309    __asm__ volatile(
2310        "li         %[tmp0],    0x06                                    \n\t"
2311        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2312        "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2313        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2314        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2315        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2316        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2317
2318        "1:                                                             \n\t"
2319        MMI_ULWC1(%[ftmp1], %[src], 0x00)
2320        MMI_ULWC1(%[ftmp2], %[src], 0x01)
2321        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2322        MMI_ULWC1(%[ftmp3], %[src], 0x00)
2323        MMI_ULWC1(%[ftmp4], %[src], 0x01)
2324
2325        CHROMA_MC_4_MMI
2326
2327        MMI_SWC1(%[ftmp1], %[dst], 0x00)
2328        "addiu      %[h],       %[h],      -0x01                        \n\t"
2329        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2330        "bnez       %[h],       1b                                      \n\t"
2331        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2332          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2333          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2334          [tmp0]"=&r"(tmp[0]),
2335          RESTRICT_ASM_LOW32
2336          RESTRICT_ASM_ADDRT
2337          [src]"+&r"(src),              [dst]"+&r"(dst),
2338          [h]"+&r"(h)
2339        : [stride]"r"((mips_reg)stride),
2340          [A]"f"(A.f),                  [B]"f"(B.f),
2341          [C]"f"(C.f),                  [D]"f"(D.f),
2342          [ff_pw_28]"f"(ff_pw_28.f)
2343        : "memory"
2344    );
2345}
2346
2347void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2348                                      uint8_t *src /* align 1 */,
2349                                      ptrdiff_t stride, int h, int x, int y)
2350{
2351    union mmi_intfloat64 A, B, C, D;
2352    double ftmp[10];
2353    uint32_t tmp[1];
2354    DECLARE_VAR_ALL64;
2355    DECLARE_VAR_ADDRT;
2356    A.i = (8 - x) * (8 - y);
2357    B.i =     (x) * (8 - y);
2358    C.i = (8 - x) *     (y);
2359    D.i =     (x) *     (y);
2360
2361    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2362
2363    __asm__ volatile(
2364        "li         %[tmp0],    0x06                                    \n\t"
2365        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2366        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2367        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2368        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2369        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2370        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2371
2372        "1:                                                             \n\t"
2373        MMI_ULDC1(%[ftmp1], %[src], 0x00)
2374        MMI_ULDC1(%[ftmp2], %[src], 0x01)
2375        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2376        MMI_ULDC1(%[ftmp3], %[src], 0x00)
2377        MMI_ULDC1(%[ftmp4], %[src], 0x01)
2378
2379        CHROMA_MC_8_MMI
2380
2381        MMI_LDC1(%[ftmp2], %[dst], 0x00)
2382        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2383
2384        MMI_SDC1(%[ftmp1], %[dst], 0x00)
2385        "addiu      %[h],       %[h],      -0x01                        \n\t"
2386        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2387        "bnez       %[h],       1b                                      \n\t"
2388        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2389          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2390          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2391          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2392          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2393          [tmp0]"=&r"(tmp[0]),
2394          RESTRICT_ASM_ALL64
2395          RESTRICT_ASM_ADDRT
2396          [src]"+&r"(src),              [dst]"+&r"(dst),
2397          [h]"+&r"(h)
2398        : [stride]"r"((mips_reg)stride),
2399          [A]"f"(A.f),                 [B]"f"(B.f),
2400          [C]"f"(C.f),                 [D]"f"(D.f),
2401          [ff_pw_28]"f"(ff_pw_28.f)
2402        : "memory"
2403    );
2404}
2405
2406void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2407                                      uint8_t *src /* align 1 */,
2408                                      ptrdiff_t stride, int h, int x, int y)
2409{
2410    union mmi_intfloat64 A, B, C, D;
2411    double ftmp[6];
2412    uint32_t tmp[1];
2413    DECLARE_VAR_LOW32;
2414    DECLARE_VAR_ADDRT;
2415    A.i = (8 - x) * (8 - y);
2416    B.i = (x) * (8 - y);
2417    C.i = (8 - x) * (y);
2418    D.i = (x) * (y);
2419
2420    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2421
2422    __asm__ volatile(
2423        "li         %[tmp0],    0x06                                    \n\t"
2424        "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2425        "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2426        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2427        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2428        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2429        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2430
2431        "1:                                                             \n\t"
2432        MMI_ULWC1(%[ftmp1], %[src], 0x00)
2433        MMI_ULWC1(%[ftmp2], %[src], 0x01)
2434        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2435        MMI_ULWC1(%[ftmp3], %[src], 0x00)
2436        MMI_ULWC1(%[ftmp4], %[src], 0x01)
2437
2438        CHROMA_MC_4_MMI
2439
2440        MMI_LWC1(%[ftmp2], %[dst], 0x00)
2441        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2442
2443        MMI_SWC1(%[ftmp1], %[dst], 0x00)
2444        "addiu      %[h],       %[h],      -0x01                        \n\t"
2445        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2446        "bnez       %[h],       1b                                      \n\t"
2447        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2448          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2449          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2450          [tmp0]"=&r"(tmp[0]),
2451          RESTRICT_ASM_LOW32
2452          RESTRICT_ASM_ADDRT
2453          [src]"+&r"(src),              [dst]"+&r"(dst),
2454          [h]"+&r"(h)
2455        : [stride]"r"((mips_reg)stride),
2456          [A]"f"(A.f),                  [B]"f"(B.f),
2457          [C]"f"(C.f),                  [D]"f"(D.f),
2458          [ff_pw_28]"f"(ff_pw_28.f)
2459        : "memory"
2460    );
2461}
2462