1/*
2 * Loongson SIMD optimized vp8dsp
3 *
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "vp8dsp_mips.h"
25#include "constants.h"
26#include "libavutil/attributes.h"
27#include "libavutil/mips/mmiutils.h"
28#include "libavutil/mem_internal.h"
29
30#define DECLARE_DOUBLE_1            double db_1
31#define DECLARE_DOUBLE_2            double db_2
32#define DECLARE_UINT32_T            uint32_t  it_1
33#define RESTRICT_ASM_DOUBLE_1       [db_1]"=&f"(db_1)
34#define RESTRICT_ASM_DOUBLE_2       [db_2]"=&f"(db_2)
35#define RESTRICT_ASM_UINT32_T       [it_1]"=&r"(it_1)
36
37#define MMI_PCMPGTUB(dst, src1, src2)                                       \
38        "pcmpeqb    %[db_1],    "#src1",        "#src2"             \n\t"   \
39        "pmaxub     %[db_2],    "#src1",        "#src2"             \n\t"   \
40        "pcmpeqb    %[db_2],    %[db_2],        "#src1"             \n\t"   \
41        "pxor       "#dst",     %[db_2],        %[db_1]             \n\t"
42
43#define MMI_BTOH(dst_l, dst_r, src)                                         \
44        "pxor       %[db_1],    %[db_1],        %[db_1]             \n\t"   \
45        "pcmpgtb    %[db_2],    %[db_1],        "#src"              \n\t"   \
46        "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
47        "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"
48
49#define MMI_VP8_LOOP_FILTER                                                 \
50        /* Calculation of hev */                                            \
51        "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
52        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
53        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
54        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
55        "pasubub    %[ftmp0],   %[p1],          %[p0]               \n\t"   \
56        "pasubub    %[ftmp1],   %[q1],          %[q0]               \n\t"   \
57        "pmaxub     %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"   \
58        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                            \
59        /* Calculation of mask */                                           \
60        "pasubub    %[ftmp1],   %[p0],          %[q0]               \n\t"   \
61        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
62        "pasubub    %[ftmp2],   %[p1],          %[q1]               \n\t"   \
63        "li         %[tmp0],    0x09                                \n\t"   \
64        "dmtc1      %[tmp0],    %[ftmp3]                            \n\t"   \
65        PSRLB_MMI(%[ftmp2],  %[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp2])     \
66        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
67        "dmtc1      %[e],       %[ftmp3]                            \n\t"   \
68        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
69        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
70        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
71        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                           \
72        "pmaxub     %[mask],    %[mask],        %[ftmp0]            \n\t"   \
73        "pasubub    %[ftmp1],   %[p3],          %[p2]               \n\t"   \
74        "pasubub    %[ftmp2],   %[p2],          %[p1]               \n\t"   \
75        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
76        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
77        "pasubub    %[ftmp1],   %[q3],          %[q2]               \n\t"   \
78        "pasubub    %[ftmp2],   %[q2],          %[q1]               \n\t"   \
79        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
80        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
81        "dmtc1      %[i],       %[ftmp3]                            \n\t"   \
82        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
83        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
84        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
85        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                            \
86        "pcmpeqw    %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
87        "pxor       %[mask],    %[mask],        %[ftmp3]            \n\t"   \
88        /* VP8_MBFILTER */                                                  \
89        "li         %[tmp0],    0x80808080                          \n\t"   \
90        "dmtc1      %[tmp0],    %[ftmp7]                            \n\t"   \
91        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"   \
92        "pxor       %[p2],      %[p2],          %[ftmp7]            \n\t"   \
93        "pxor       %[p1],      %[p1],          %[ftmp7]            \n\t"   \
94        "pxor       %[p0],      %[p0],          %[ftmp7]            \n\t"   \
95        "pxor       %[q0],      %[q0],          %[ftmp7]            \n\t"   \
96        "pxor       %[q1],      %[q1],          %[ftmp7]            \n\t"   \
97        "pxor       %[q2],      %[q2],          %[ftmp7]            \n\t"   \
98        "psubsb     %[ftmp4],   %[p1],          %[q1]               \n\t"   \
99        "psubb      %[ftmp5],   %[q0],          %[p0]               \n\t"   \
100        MMI_BTOH(%[ftmp1],  %[ftmp0],  %[ftmp5])                            \
101        MMI_BTOH(%[ftmp3],  %[ftmp2],  %[ftmp4])                            \
102        /* Right part */                                                    \
103        "paddh      %[ftmp5],   %[ftmp0],       %[ftmp0]            \n\t"   \
104        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"   \
105        "paddh      %[ftmp0],   %[ftmp2],       %[ftmp0]            \n\t"   \
106        /* Left part */                                                     \
107        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp1]            \n\t"   \
108        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"   \
109        "paddh      %[ftmp1],   %[ftmp3],       %[ftmp1]            \n\t"   \
110        /* Combine left and right part */                                   \
111        "packsshb   %[ftmp1],   %[ftmp0],       %[ftmp1]            \n\t"   \
112        "pand       %[ftmp1],   %[ftmp1],       %[mask]             \n\t"   \
113        "pand       %[ftmp2],   %[ftmp1],       %[hev]              \n\t"   \
114        "li         %[tmp0],    0x04040404                          \n\t"   \
115        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
116        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
117        "paddsb     %[ftmp3],   %[ftmp2],       %[ftmp0]            \n\t"   \
118        "li         %[tmp0],    0x0B                                \n\t"   \
119        "dmtc1      %[tmp0],    %[ftmp4]                            \n\t"   \
120        PSRAB_MMI(%[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp6],  %[ftmp3])     \
121        "li         %[tmp0],    0x03030303                          \n\t"   \
122        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
123        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
124        "paddsb     %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"   \
125        "li         %[tmp0],    0x0B                                \n\t"   \
126        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
127        PSRAB_MMI(%[ftmp4],  %[ftmp2],  %[ftmp5],  %[ftmp6],  %[ftmp4])     \
128        "psubsb     %[q0],      %[q0],          %[ftmp3]            \n\t"   \
129        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
130        /* filt_val &= ~hev */                                              \
131        "pcmpeqw    %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
132        "pxor       %[hev],     %[hev],         %[ftmp0]            \n\t"   \
133        "pand       %[ftmp1],   %[ftmp1],       %[hev]              \n\t"   \
134        MMI_BTOH(%[ftmp5],  %[ftmp6],  %[ftmp1])                            \
135        "li         %[tmp0],    0x07                                \n\t"   \
136        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
137        "li         %[tmp0],    0x001b001b                          \n\t"   \
138        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
139        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
140        "li         %[tmp0],    0x003f003f                          \n\t"   \
141        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
142        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
143        /* Right part */                                                    \
144        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
145        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
146        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
147        /* Left part */                                                     \
148        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
149        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
150        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
151        /* Combine left and right part */                                   \
152        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
153        "psubsb     %[q0],      %[q0],          %[ftmp4]            \n\t"   \
154        "pxor       %[q0],      %[q0],          %[ftmp7]            \n\t"   \
155        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
156        "pxor       %[p0],      %[p0],          %[ftmp7]            \n\t"   \
157        "li         %[tmp0],    0x00120012                          \n\t"   \
158        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
159        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
160        /* Right part */                                                    \
161        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
162        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
163        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
164        /* Left part */                                                     \
165        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
166        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
167        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
168        /* Combine left and right part */                                   \
169        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
170        "psubsb     %[q1],      %[q1],          %[ftmp4]            \n\t"   \
171        "pxor       %[q1],      %[q1],          %[ftmp7]            \n\t"   \
172        "paddsb     %[p1],      %[p1],          %[ftmp4]            \n\t"   \
173        "pxor       %[p1],      %[p1],          %[ftmp7]            \n\t"   \
174        "li         %[tmp0],    0x03                                \n\t"   \
175        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
176        /* Right part */                                                    \
177        "psllh      %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
178        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"   \
179        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
180        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
181        /* Left part */                                                     \
182        "psllh      %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
183        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"   \
184        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
185        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
186        /* Combine left and right part */                                   \
187        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
188        "psubsb     %[q2],      %[q2],          %[ftmp4]            \n\t"   \
189        "pxor       %[q2],      %[q2],          %[ftmp7]            \n\t"   \
190        "paddsb     %[p2],      %[p2],          %[ftmp4]            \n\t"   \
191        "pxor       %[p2],      %[p2],          %[ftmp7]            \n\t"
192
193#define PUT_VP8_EPEL4_H6_MMI(src, dst)                                      \
194        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
195        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
196        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
197                                                                            \
198        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
199        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
200        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
201        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
202                                                                            \
203        MMI_ULWC1(%[ftmp1], src, -0x02)                                     \
204        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
205        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
206        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
207                                                                            \
208        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
209        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
210        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
211                                                                            \
212        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
213        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
214        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
215        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
216                                                                            \
217        MMI_ULWC1(%[ftmp1], src, 0x03)                                      \
218        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
219        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
220        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
221                                                                            \
222        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
223        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
224        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
225        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
226                                                                            \
227        MMI_SWC1(%[ftmp1], dst, 0x00)
228
229
230#define PUT_VP8_EPEL4_H4_MMI(src, dst)                                      \
231        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
232        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
233        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
234                                                                            \
235        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
236        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
237        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
238        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
239                                                                            \
240        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
241        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
242        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
243                                                                            \
244        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
245        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
246        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
247        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
248                                                                            \
249        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
250                                                                            \
251        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
252        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
253                                                                            \
254        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
255        MMI_SWC1(%[ftmp1], dst, 0x00)
256
257
258#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride)                     \
259        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
260        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
261        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
262                                                                            \
263        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
264        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
265        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
266        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
267        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
268                                                                            \
269        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
270        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
271        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
272        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
273        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
274                                                                            \
275        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
276        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
277        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
278        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
279                                                                            \
280        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
281        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
282        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
283        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
284        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
285                                                                            \
286        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
287        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
288        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
289        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
290        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
291                                                                            \
292        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
293                                                                            \
294        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
295        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
296        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
297                                                                            \
298        MMI_SWC1(%[ftmp1], dst, 0x00)
299
300
301#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride)                     \
302        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
303        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
304        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
305                                                                            \
306        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
307        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
308        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
309        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
310        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
311                                                                            \
312        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
313        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
314        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
315        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
316                                                                            \
317        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
318        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
319        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
320        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
321        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
322                                                                            \
323        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
324                                                                            \
325        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
326        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
327        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
328                                                                            \
329        MMI_SWC1(%[ftmp1], dst, 0x00)
330
331
332#define PUT_VP8_EPEL8_H6_MMI(src, dst)                                      \
333        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
334        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
335        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
336        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
337        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
338                                                                            \
339        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
340        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
341        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
342        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
343        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
344        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
345        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
346                                                                            \
347        MMI_ULDC1(%[ftmp1], src, -0x02)                                     \
348        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
349        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
350        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
351        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
352        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
353        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
354                                                                            \
355        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
356        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
357        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
358        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
359        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
360                                                                            \
361        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
362        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
363        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
364        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
365        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
366        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
367        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
368                                                                            \
369        MMI_ULDC1(%[ftmp1], src, 0x03)                                      \
370        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
371        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
372        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
373        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
374        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
375        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
376                                                                            \
377        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
378        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
379                                                                            \
380        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
381        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
382        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
383        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
384        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
385                                                                            \
386        MMI_SDC1(%[ftmp1], dst, 0x00)
387
388
389#define PUT_VP8_EPEL8_H4_MMI(src, dst)                                      \
390        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
391        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
392        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
393        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
394        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
395                                                                            \
396        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
397        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
398        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
399        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
400        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
401        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
402        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
403                                                                            \
404        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
405        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
406        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
407        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
408        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
409                                                                            \
410        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
411        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
412        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
413        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
414        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
415        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
416        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
417                                                                            \
418        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
419        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
420                                                                            \
421        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
422        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
423        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
424        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
425                                                                            \
426        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
427        MMI_SDC1(%[ftmp1], dst, 0x00)
428
429
430#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride)                     \
431        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
432        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
433        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
434        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
435        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
436                                                                            \
437        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
438        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
439        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
440        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
441        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
442        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
443        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
444        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
445                                                                            \
446        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
447        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
448        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
449        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
450        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
451        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
452        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
453        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
454                                                                            \
455        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
456        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
457        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
458        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
459        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
460        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
461                                                                            \
462        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
463        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
464        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
465        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
466        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
467        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
468        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
469        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
470                                                                            \
471        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
472        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
473        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
474        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
475        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
476        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
477        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
478        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
479                                                                            \
480        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
481        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
482                                                                            \
483        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
484        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
485        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
486        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
487        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
488                                                                            \
489        MMI_SDC1(%[ftmp1], dst, 0x00)
490
491
492#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride)                     \
493        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
494        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
495        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
496        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
497        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
498                                                                            \
499        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
500        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
501        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
502        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
503        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
504        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
505        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
506        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
507                                                                            \
508        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
509        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
510        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
511        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
512        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
513        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
514                                                                            \
515        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
516        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
517        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
518        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
519        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
520        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
521        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
522        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
523                                                                            \
524        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
525        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
526                                                                            \
527        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
528        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
529        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
530        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
531        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
532                                                                            \
533        MMI_SDC1(%[ftmp1], dst, 0x00)
534
535
536#define PUT_VP8_BILINEAR8_H_MMI(src, dst)                                   \
537        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
538        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
539        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
540        "pmullh     %[ftmp5],   %[ftmp2],       %[a]                \n\t"   \
541        "pmullh     %[ftmp6],   %[ftmp3],       %[a]                \n\t"   \
542                                                                            \
543        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
544        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
545        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
546        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
547        "pmullh     %[ftmp3],   %[ftmp3],       %[b]                \n\t"   \
548        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
549        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
550                                                                            \
551        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
552        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
553        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
554        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
555                                                                            \
556        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
557        MMI_SDC1(%[ftmp1], dst, 0x00)
558
559
560#define PUT_VP8_BILINEAR4_H_MMI(src, dst)                                   \
561        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
562        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
563        "pmullh     %[ftmp3],   %[ftmp2],       %[a]                \n\t"   \
564                                                                            \
565        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
566        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
567        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
568        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
569                                                                            \
570        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
571        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
572                                                                            \
573        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
574        MMI_SWC1(%[ftmp1], dst, 0x00)
575
576
577#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride)                    \
578        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
579        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
580        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
581        "pmullh     %[ftmp5],   %[ftmp2],       %[c]                \n\t"   \
582        "pmullh     %[ftmp6],   %[ftmp3],       %[c]                \n\t"   \
583                                                                            \
584        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
585        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
586        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
587        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
588        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
589        "pmullh     %[ftmp3],   %[ftmp3],       %[d]                \n\t"   \
590        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
591        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
592                                                                            \
593        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
594        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
595        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
596        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
597                                                                            \
598        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
599        MMI_SDC1(%[ftmp1], dst, 0x00)
600
601
602#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride)                    \
603        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
604        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
605        "pmullh     %[ftmp3],   %[ftmp2],       %[c]                \n\t"   \
606                                                                            \
607        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
608        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
609        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
610        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
611        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
612                                                                            \
613        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
614        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
615                                                                            \
616        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
617        MMI_SWC1(%[ftmp1], dst, 0x00)
618
619
620DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
621   {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
622    0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
623
624   {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
625    0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
626
627   {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
628    0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
629
630   {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
631    0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
632
633   {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
634    0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
635
636   {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
637    0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
638
639   {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
640    0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
641};
642
643#if 0
644#define FILTER_6TAP(src, F, stride)                                           \
645    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
646        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
647        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
648
649#define FILTER_4TAP(src, F, stride)                                           \
650    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
651        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
652
653static const uint8_t subpel_filters[7][6] = {
654    { 0,  6, 123,  12,  1, 0 },
655    { 2, 11, 108,  36,  8, 1 },
656    { 0,  9,  93,  50,  6, 0 },
657    { 3, 16,  77,  77, 16, 3 },
658    { 0,  6,  50,  93,  9, 0 },
659    { 1,  8,  36, 108, 11, 2 },
660    { 0,  1,  12, 123,  6, 0 },
661};
662
663#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
664#define MUL_35468(a)  (((a) * 35468) >> 16)
665#endif
666
667#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
668static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
669        ptrdiff_t stride)
670{
671    int av_unused p1 = p[-2 * stride];
672    int av_unused p0 = p[-1 * stride];
673    int av_unused q0 = p[ 0 * stride];
674    int av_unused q1 = p[ 1 * stride];
675    int a, f1, f2;
676    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
677
678    a = 3 * (q0 - p0);
679    a += clip_int8(p1 - q1);
680    a = clip_int8(a);
681
682    // We deviate from the spec here with c(a+3) >> 3
683    // since that's what libvpx does.
684    f1 = FFMIN(a + 4, 127) >> 3;
685    f2 = FFMIN(a + 3, 127) >> 3;
686
687    // Despite what the spec says, we do need to clamp here to
688    // be bitexact with libvpx.
689    p[-1 * stride] = cm[p0 + f2];
690    p[ 0 * stride] = cm[q0 - f1];
691}
692
693static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
694        ptrdiff_t stride)
695{
696    int av_unused p1 = p[-2 * stride];
697    int av_unused p0 = p[-1 * stride];
698    int av_unused q0 = p[ 0 * stride];
699    int av_unused q1 = p[ 1 * stride];
700    int a, f1, f2;
701    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
702
703    a = 3 * (q0 - p0);
704    a = clip_int8(a);
705
706    // We deviate from the spec here with c(a+3) >> 3
707    // since that's what libvpx does.
708    f1 = FFMIN(a + 4, 127) >> 3;
709    f2 = FFMIN(a + 3, 127) >> 3;
710
711    // Despite what the spec says, we do need to clamp here to
712    // be bitexact with libvpx.
713    p[-1 * stride] = cm[p0 + f2];
714    p[ 0 * stride] = cm[q0 - f1];
715    a              = (f1 + 1) >> 1;
716    p[-2 * stride] = cm[p1 + a];
717    p[ 1 * stride] = cm[q1 - a];
718}
719
720static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
721        int flim)
722{
723    int av_unused p1 = p[-2 * stride];
724    int av_unused p0 = p[-1 * stride];
725    int av_unused q0 = p[ 0 * stride];
726    int av_unused q1 = p[ 1 * stride];
727
728    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
729}
730
731static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
732{
733    int av_unused p1 = p[-2 * stride];
734    int av_unused p0 = p[-1 * stride];
735    int av_unused q0 = p[ 0 * stride];
736    int av_unused q1 = p[ 1 * stride];
737
738    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
739}
740
741static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
742{
743    int a0, a1, a2, w;
744    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
745
746    int av_unused p2 = p[-3 * stride];
747    int av_unused p1 = p[-2 * stride];
748    int av_unused p0 = p[-1 * stride];
749    int av_unused q0 = p[ 0 * stride];
750    int av_unused q1 = p[ 1 * stride];
751    int av_unused q2 = p[ 2 * stride];
752
753    w = clip_int8(p1 - q1);
754    w = clip_int8(w + 3 * (q0 - p0));
755
756    a0 = (27 * w + 63) >> 7;
757    a1 = (18 * w + 63) >> 7;
758    a2 =  (9 * w + 63) >> 7;
759
760    p[-3 * stride] = cm[p2 + a2];
761    p[-2 * stride] = cm[p1 + a1];
762    p[-1 * stride] = cm[p0 + a0];
763    p[ 0 * stride] = cm[q0 - a0];
764    p[ 1 * stride] = cm[q1 - a1];
765    p[ 2 * stride] = cm[q2 - a2];
766}
767
768static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
769        int E, int I)
770{
771    int av_unused p3 = p[-4 * stride];
772    int av_unused p2 = p[-3 * stride];
773    int av_unused p1 = p[-2 * stride];
774    int av_unused p0 = p[-1 * stride];
775    int av_unused q0 = p[ 0 * stride];
776    int av_unused q1 = p[ 1 * stride];
777    int av_unused q2 = p[ 2 * stride];
778    int av_unused q3 = p[ 3 * stride];
779
780    return vp8_simple_limit(p, stride, E) &&
781           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
782           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
783           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
784}
785
786static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
787        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
788{
789    double ftmp[18];
790    uint32_t tmp[1];
791    DECLARE_DOUBLE_1;
792    DECLARE_DOUBLE_2;
793    DECLARE_UINT32_T;
794    DECLARE_VAR_ALL64;
795
796    __asm__ volatile(
797        /* Get data from dst */
798        MMI_ULDC1(%[q0], %[dst], 0x0)
799        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
800        MMI_ULDC1(%[p0], %[tmp0], 0x0)
801        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
802        MMI_ULDC1(%[p1], %[tmp0], 0x0)
803        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
804        MMI_ULDC1(%[p2], %[tmp0], 0x0)
805        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
806        MMI_ULDC1(%[p3], %[tmp0], 0x0)
807        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
808        MMI_ULDC1(%[q1], %[tmp0], 0x0)
809        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
810        MMI_ULDC1(%[q2], %[tmp0], 0x0)
811        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
812        MMI_ULDC1(%[q3], %[tmp0], 0x0)
813        MMI_VP8_LOOP_FILTER
814        /* Move to dst */
815        MMI_USDC1(%[q0], %[dst], 0x0)
816        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
817        MMI_USDC1(%[p0], %[tmp0], 0x0)
818        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
819        MMI_USDC1(%[p1], %[tmp0], 0x0)
820        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
821        MMI_USDC1(%[p2], %[tmp0], 0x0)
822        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
823        MMI_USDC1(%[q1], %[tmp0], 0x0)
824        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
825        MMI_USDC1(%[q2], %[tmp0], 0x0)
826        : RESTRICT_ASM_ALL64
827          [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
828          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
829          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
830          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
831          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
832          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
833          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
834          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
835          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
836          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
837          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
838          RESTRICT_ASM_UINT32_T
839        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
840          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
841        : "memory"
842    );
843}
844
845static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
846        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
847{
848    int i;
849
850    for (i = 0; i < 8; i++)
851        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
852            int hv = hev(dst + i * 1, stride, hev_thresh);
853            if (hv)
854                vp8_filter_common_is4tap(dst + i * 1, stride);
855            else
856                vp8_filter_common_isnot4tap(dst + i * 1, stride);
857        }
858}
859
860static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
861        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
862{
863    double ftmp[18];
864    uint32_t tmp[1];
865    DECLARE_DOUBLE_1;
866    DECLARE_DOUBLE_2;
867    DECLARE_UINT32_T;
868    DECLARE_VAR_ALL64;
869
870    __asm__ volatile(
871        /* Get data from dst */
872        MMI_ULDC1(%[p3], %[dst], -0x04)
873        PTR_ADDU    "%[tmp0],     %[dst],           %[stride]     \n\t"
874        MMI_ULDC1(%[p2], %[tmp0], -0x04)
875        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
876        MMI_ULDC1(%[p1], %[tmp0], -0x04)
877        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
878        MMI_ULDC1(%[p0], %[tmp0], -0x04)
879        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
880        MMI_ULDC1(%[q0], %[tmp0], -0x04)
881        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
882        MMI_ULDC1(%[q1], %[tmp0], -0x04)
883        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
884        MMI_ULDC1(%[q2], %[tmp0], -0x04)
885        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
886        MMI_ULDC1(%[q3], %[tmp0], -0x04)
887        /* Matrix transpose */
888        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
889                     %[q0], %[q1], %[q2], %[q3],
890                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
891        MMI_VP8_LOOP_FILTER
892        /* Matrix transpose */
893        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
894                     %[q0], %[q1], %[q2], %[q3],
895                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
896        /* Move to dst */
897        MMI_USDC1(%[p3], %[dst], -0x04)
898        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
899        MMI_USDC1(%[p2], %[dst], -0x04)
900        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
901        MMI_USDC1(%[p1], %[dst], -0x04)
902        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
903        MMI_USDC1(%[p0], %[dst], -0x04)
904        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
905        MMI_USDC1(%[q0], %[dst], -0x04)
906        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
907        MMI_USDC1(%[q1], %[dst], -0x04)
908        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
909        MMI_USDC1(%[q2], %[dst], -0x04)
910        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
911        MMI_USDC1(%[q3], %[dst], -0x04)
912        : RESTRICT_ASM_ALL64
913          [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
914          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
915          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
916          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
917          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
918          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
919          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
920          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
921          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
922          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
923          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
924          RESTRICT_ASM_UINT32_T
925        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
926          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
927        : "memory"
928    );
929}
930
931static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
932        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
933{
934    int i;
935
936    for (i = 0; i < 8; i++)
937        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
938            int hv = hev(dst + i * stride, 1, hev_thresh);
939            if (hv)
940                vp8_filter_common_is4tap(dst + i * stride, 1);
941            else
942                vp8_filter_common_isnot4tap(dst + i * stride, 1);
943        }
944}
945
946void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
947{
948#if 1
949    double ftmp[8];
950    DECLARE_VAR_ALL64;
951
952    __asm__ volatile (
953        MMI_LDC1(%[ftmp0], %[dc], 0x00)
954        MMI_LDC1(%[ftmp1], %[dc], 0x08)
955        MMI_LDC1(%[ftmp2], %[dc], 0x10)
956        MMI_LDC1(%[ftmp3], %[dc], 0x18)
957        "paddsh     %[ftmp4],   %[ftmp0],       %[ftmp3]            \n\t"
958        "psubsh     %[ftmp5],   %[ftmp0],       %[ftmp3]            \n\t"
959        "paddsh     %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
960        "psubsh     %[ftmp7],   %[ftmp1],       %[ftmp2]            \n\t"
961        "paddsh     %[ftmp0],   %[ftmp4],       %[ftmp6]            \n\t"
962        "paddsh     %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
963        "psubsh     %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
964        "psubsh     %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
965        MMI_SDC1(%[ftmp0], %[dc], 0x00)
966        MMI_SDC1(%[ftmp1], %[dc], 0x08)
967        MMI_SDC1(%[ftmp2], %[dc], 0x10)
968        MMI_SDC1(%[ftmp3], %[dc], 0x18)
969        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
970          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
971          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
972          [ftmp6]"=&f"(ftmp[6]),
973          RESTRICT_ASM_ALL64
974          [ftmp7]"=&f"(ftmp[7])
975        : [dc]"r"((uint8_t*)dc)
976        : "memory"
977    );
978
979    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
980    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
981    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
982    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
983
984    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
985    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
986    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
987    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
988
989    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
990    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
991    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
992    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
993
994    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
995    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
996    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
997    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
998
999    __asm__ volatile (
1000        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1001        MMI_SDC1(%[ftmp0], %[dc], 0x00)
1002        MMI_SDC1(%[ftmp0], %[dc], 0x08)
1003        MMI_SDC1(%[ftmp0], %[dc], 0x10)
1004        MMI_SDC1(%[ftmp0], %[dc], 0x18)
1005        : RESTRICT_ASM_ALL64
1006          [ftmp0]"=&f"(ftmp[0])
1007        : [dc]"r"((uint8_t *)dc)
1008        : "memory"
1009    );
1010#else
1011    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1012
1013    t00 = dc[0] + dc[12];
1014    t10 = dc[1] + dc[13];
1015    t20 = dc[2] + dc[14];
1016    t30 = dc[3] + dc[15];
1017
1018    t03 = dc[0] - dc[12];
1019    t13 = dc[1] - dc[13];
1020    t23 = dc[2] - dc[14];
1021    t33 = dc[3] - dc[15];
1022
1023    t01 = dc[4] + dc[ 8];
1024    t11 = dc[5] + dc[ 9];
1025    t21 = dc[6] + dc[10];
1026    t31 = dc[7] + dc[11];
1027
1028    t02 = dc[4] - dc[ 8];
1029    t12 = dc[5] - dc[ 9];
1030    t22 = dc[6] - dc[10];
1031    t32 = dc[7] - dc[11];
1032
1033    dc[ 0] = t00 + t01;
1034    dc[ 1] = t10 + t11;
1035    dc[ 2] = t20 + t21;
1036    dc[ 3] = t30 + t31;
1037
1038    dc[ 4] = t03 + t02;
1039    dc[ 5] = t13 + t12;
1040    dc[ 6] = t23 + t22;
1041    dc[ 7] = t33 + t32;
1042
1043    dc[ 8] = t00 - t01;
1044    dc[ 9] = t10 - t11;
1045    dc[10] = t20 - t21;
1046    dc[11] = t30 - t31;
1047
1048    dc[12] = t03 - t02;
1049    dc[13] = t13 - t12;
1050    dc[14] = t23 - t22;
1051    dc[15] = t33 - t32;
1052
1053    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1054    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1055    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1056    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1057
1058    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1059    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1060    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1061    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1062
1063    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1064    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1065    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1066    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1067
1068    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1069    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1070    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1071    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1072
1073    AV_ZERO64(dc + 0);
1074    AV_ZERO64(dc + 4);
1075    AV_ZERO64(dc + 8);
1076    AV_ZERO64(dc + 12);
1077#endif
1078}
1079
1080void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
1081{
1082    int val = (dc[0] + 3) >> 3;
1083
1084    dc[0] = 0;
1085
1086    block[0][0][0] = val;
1087    block[0][1][0] = val;
1088    block[0][2][0] = val;
1089    block[0][3][0] = val;
1090    block[1][0][0] = val;
1091    block[1][1][0] = val;
1092    block[1][2][0] = val;
1093    block[1][3][0] = val;
1094    block[2][0][0] = val;
1095    block[2][1][0] = val;
1096    block[2][2][0] = val;
1097    block[2][3][0] = val;
1098    block[3][0][0] = val;
1099    block[3][1][0] = val;
1100    block[3][2][0] = val;
1101    block[3][3][0] = val;
1102}
1103
1104void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1105{
1106#if 1
1107    double ftmp[12];
1108    uint32_t tmp[1];
1109    union av_intfloat64 ff_ph_4e7b_u;
1110    union av_intfloat64 ff_ph_22a3_u;
1111    DECLARE_VAR_LOW32;
1112    DECLARE_VAR_ALL64;
1113    ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL;
1114    ff_ph_22a3_u.i = 0x22a322a322a322a3ULL;
1115
1116    __asm__ volatile (
1117        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1118        MMI_LDC1(%[ftmp1], %[block], 0x00)
1119        MMI_LDC1(%[ftmp2], %[block], 0x08)
1120        MMI_LDC1(%[ftmp3], %[block], 0x10)
1121        MMI_LDC1(%[ftmp4], %[block], 0x18)
1122
1123        "li         %[tmp0],    0x02                                \n\t"
1124        "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
1125
1126        // block[0...3] + block[8...11]
1127        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
1128        // block[0...3] - block[8...11]
1129        "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
1130        // MUL_35468(block[12...15])
1131        "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
1132        "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1133        // MUL_35468(block[4...7])
1134        "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
1135        "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1136        // MUL_20091(block[4...7]
1137        "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
1138        "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
1139        // MUL_20091(block[12...15])
1140        "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
1141        "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"
1142
1143        // tmp[0 4  8 12]
1144        "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
1145        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
1146        // tmp[1 5  9 13]
1147        "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
1148        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
1149        // tmp[2 6 10 14]
1150        "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
1151        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
1152        // tmp[3 7 11 15]
1153        "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
1154        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
1155
1156        MMI_SDC1(%[ftmp0], %[block], 0x00)
1157        MMI_SDC1(%[ftmp0], %[block], 0x08)
1158        MMI_SDC1(%[ftmp0], %[block], 0x10)
1159        MMI_SDC1(%[ftmp0], %[block], 0x18)
1160
1161        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1162                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1163
1164        // t[0 4  8 12]
1165        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
1166        // t[1 5  9 13]
1167        "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
1168        // t[2 6 10 14]
1169        "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
1170        "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1171        "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
1172        "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
1173        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
1174        // t[3 7 11 15]
1175        "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
1176        "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1177        "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
1178        "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
1179        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"
1180
1181        "li         %[tmp0],    0x03                                \n\t"
1182        "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
1183        "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
1184        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_4]          \n\t"
1185        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
1186        "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
1187        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_4]          \n\t"
1188        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
1189        "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
1190        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"
1191        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
1192        "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
1193        "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_4]          \n\t"
1194        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
1195
1196        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1197                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1198
1199        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1200        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1201        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1202        MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1203
1204        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
1205        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1206        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
1207        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"
1208
1209        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1210        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1211        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1212        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
1213
1214        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1215        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1216        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1217        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1218
1219        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1220        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1221        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1222        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1223        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1224          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1225          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1226          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1227          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1228          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
1229          RESTRICT_ASM_LOW32
1230          RESTRICT_ASM_ALL64
1231          [tmp0]"=&r"(tmp[0])
1232        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
1233          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
1234          [block]"r"(block),                [ff_pw_4]"f"(ff_pw_4.f),
1235          [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f),  [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
1236        : "memory"
1237    );
1238#else
1239    int i, t0, t1, t2, t3;
1240    int16_t tmp[16];
1241
1242    for (i = 0; i < 4; i++) {
1243        t0 = block[0 + i] + block[8 + i];
1244        t1 = block[0 + i] - block[8 + i];
1245        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1246        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1247        block[ 0 + i] = 0;
1248        block[ 4 + i] = 0;
1249        block[ 8 + i] = 0;
1250        block[12 + i] = 0;
1251
1252        tmp[i * 4 + 0] = t0 + t3;
1253        tmp[i * 4 + 1] = t1 + t2;
1254        tmp[i * 4 + 2] = t1 - t2;
1255        tmp[i * 4 + 3] = t0 - t3;
1256    }
1257
1258    for (i = 0; i < 4; i++) {
1259        t0 = tmp[0 + i] + tmp[8 + i];
1260        t1 = tmp[0 + i] - tmp[8 + i];
1261        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1262        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1263
1264        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1265        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1266        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1267        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1268        dst   += stride;
1269    }
1270#endif
1271}
1272
1273void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1274{
1275#if 1
1276    int dc = (block[0] + 4) >> 3;
1277    double ftmp[6];
1278    DECLARE_VAR_LOW32;
1279
1280    block[0] = 0;
1281
1282    __asm__ volatile (
1283        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1284        "mtc1       %[dc],      %[ftmp5]                            \n\t"
1285        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1286        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1287        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1288        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1289        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
1290        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1291        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1292        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1293        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1294        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1295        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
1296        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1297        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
1298        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1299        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1300        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1301        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1302        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1303        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1304        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1305        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1306        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1307          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1308          [ftmp4]"=&f"(ftmp[4]),
1309          RESTRICT_ASM_LOW32
1310          [ftmp5]"=&f"(ftmp[5])
1311        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
1312          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
1313          [dc]"r"(dc)
1314        : "memory"
1315    );
1316#else
1317    int i, dc = (block[0] + 4) >> 3;
1318
1319    block[0] = 0;
1320
1321    for (i = 0; i < 4; i++) {
1322        dst[0] = av_clip_uint8(dst[0] + dc);
1323        dst[1] = av_clip_uint8(dst[1] + dc);
1324        dst[2] = av_clip_uint8(dst[2] + dc);
1325        dst[3] = av_clip_uint8(dst[3] + dc);
1326        dst   += stride;
1327    }
1328#endif
1329}
1330
1331void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
1332        ptrdiff_t stride)
1333{
1334    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
1335    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
1336    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
1337    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
1338}
1339
1340void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
1341        ptrdiff_t stride)
1342{
1343    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
1344    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
1345    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
1346    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
1347}
1348
1349// loop filter applied to edges between macroblocks
1350void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1351        int flim_I, int hev_thresh)
1352{
1353    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1354    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
1355}
1356
1357void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1358        int flim_I, int hev_thresh)
1359{
1360    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1361    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
1362                           hev_thresh);
1363}
1364
1365void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1366        int flim_E, int flim_I, int hev_thresh)
1367{
1368    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1369    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1370}
1371
1372void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1373        int flim_E, int flim_I, int hev_thresh)
1374{
1375    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1376    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1377}
1378
1379// loop filter applied to inner macroblock edges
1380void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1381        int flim_E, int flim_I, int hev_thresh)
1382{
1383    int i;
1384
1385    for (i = 0; i < 16; i++)
1386        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
1387            int hv = hev(dst + i * 1, stride, hev_thresh);
1388            if (hv)
1389                vp8_filter_common_is4tap(dst + i * 1, stride);
1390            else
1391                vp8_filter_common_isnot4tap(dst + i * 1, stride);
1392        }
1393}
1394
1395void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1396        int flim_E, int flim_I, int hev_thresh)
1397{
1398    int i;
1399
1400    for (i = 0; i < 16; i++)
1401        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
1402            int hv = hev(dst + i * stride, 1, hev_thresh);
1403            if (hv)
1404                vp8_filter_common_is4tap(dst + i * stride, 1);
1405            else
1406                vp8_filter_common_isnot4tap(dst + i * stride, 1);
1407        }
1408}
1409
1410void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1411        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1412{
1413    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1414    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1415}
1416
1417void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1418        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1419{
1420    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1421    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1422}
1423
1424void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1425{
1426    int i;
1427
1428    for (i = 0; i < 16; i++)
1429        if (vp8_simple_limit(dst + i, stride, flim))
1430            vp8_filter_common_is4tap(dst + i, stride);
1431}
1432
1433void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1434{
1435    int i;
1436
1437    for (i = 0; i < 16; i++)
1438        if (vp8_simple_limit(dst + i * stride, 1, flim))
1439            vp8_filter_common_is4tap(dst + i * stride, 1);
1440}
1441
1442void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1443        ptrdiff_t srcstride, int h, int x, int y)
1444{
1445#if 1
1446    double ftmp[2];
1447    uint64_t tmp[2];
1448    mips_reg addr[2];
1449    DECLARE_VAR_ALL64;
1450
1451    __asm__ volatile (
1452        "1:                                                         \n\t"
1453        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
1454        MMI_ULDC1(%[ftmp0], %[src], 0x00)
1455        "ldl        %[tmp0],    0x0f(%[src])                        \n\t"
1456        "ldr        %[tmp0],    0x08(%[src])                        \n\t"
1457        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1458        "ldl        %[tmp1],    0x0f(%[addr0])                      \n\t"
1459        "ldr        %[tmp1],    0x08(%[addr0])                      \n\t"
1460        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1461        MMI_SDC1(%[ftmp0], %[dst], 0x00)
1462        "sdl        %[tmp0],    0x0f(%[dst])                        \n\t"
1463        "sdr        %[tmp0],    0x08(%[dst])                        \n\t"
1464        "addiu      %[h],       %[h],           -0x02               \n\t"
1465        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1466        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1467        "sdl        %[tmp1],    0x0f(%[addr1])                      \n\t"
1468        "sdr        %[tmp1],    0x08(%[addr1])                      \n\t"
1469        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1470        "bnez       %[h],       1b                                  \n\t"
1471        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1472          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
1473          RESTRICT_ASM_ALL64
1474          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1475          [dst]"+&r"(dst),                  [src]"+&r"(src),
1476          [h]"+&r"(h)
1477        : [dststride]"r"((mips_reg)dststride),
1478          [srcstride]"r"((mips_reg)srcstride)
1479        : "memory"
1480    );
1481#else
1482    int i;
1483
1484    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1485        memcpy(dst, src, 16);
1486#endif
1487}
1488
1489void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1490        ptrdiff_t srcstride, int h, int x, int y)
1491{
1492#if 1
1493    double ftmp[1];
1494    uint64_t tmp[1];
1495    mips_reg addr[2];
1496    DECLARE_VAR_ALL64;
1497
1498    __asm__ volatile (
1499        "1:                                                         \n\t"
1500        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
1501        MMI_ULDC1(%[ftmp0], %[src], 0x00)
1502        "ldl        %[tmp0],    0x07(%[addr0])                      \n\t"
1503        "ldr        %[tmp0],    0x00(%[addr0])                      \n\t"
1504        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1505        MMI_SDC1(%[ftmp0], %[dst], 0x00)
1506        "addiu      %[h],       %[h],           -0x02               \n\t"
1507        "sdl        %[tmp0],    0x07(%[addr1])                      \n\t"
1508        "sdr        %[tmp0],    0x00(%[addr1])                      \n\t"
1509        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1510        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1511        "bnez       %[h],       1b                                  \n\t"
1512        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
1513          RESTRICT_ASM_ALL64
1514          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1515          [dst]"+&r"(dst),                  [src]"+&r"(src),
1516          [h]"+&r"(h)
1517        : [dststride]"r"((mips_reg)dststride),
1518          [srcstride]"r"((mips_reg)srcstride)
1519        : "memory"
1520    );
1521#else
1522    int i;
1523
1524    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1525        memcpy(dst, src, 8);
1526#endif
1527}
1528
1529void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1530        ptrdiff_t srcstride, int h, int x, int y)
1531{
1532#if 1
1533    double ftmp[1];
1534    uint64_t tmp[1];
1535    mips_reg addr[2];
1536    DECLARE_VAR_LOW32;
1537
1538    __asm__ volatile (
1539        "1:                                                         \n\t"
1540        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
1541        MMI_LWC1(%[ftmp0], %[src], 0x00)
1542        "lwl        %[tmp0],    0x03(%[addr0])                      \n\t"
1543        "lwr        %[tmp0],    0x00(%[addr0])                      \n\t"
1544        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1545        MMI_SWC1(%[ftmp0], %[dst], 0x00)
1546        "addiu      %[h],       %[h],           -0x02               \n\t"
1547        "swl        %[tmp0],    0x03(%[addr1])                      \n\t"
1548        "swr        %[tmp0],    0x00(%[addr1])                      \n\t"
1549        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1550        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1551        "bnez       %[h],       1b                                  \n\t"
1552        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
1553          RESTRICT_ASM_LOW32
1554          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1555          [dst]"+&r"(dst),                  [src]"+&r"(src),
1556          [h]"+&r"(h)
1557        : [dststride]"r"((mips_reg)dststride),
1558          [srcstride]"r"((mips_reg)srcstride)
1559        : "memory"
1560    );
1561#else
1562    int i;
1563
1564    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1565        memcpy(dst, src, 4);
1566#endif
1567}
1568
1569void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1570        ptrdiff_t srcstride, int h, int mx, int my)
1571{
1572#if 1
1573    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1574    double ftmp[9];
1575    uint32_t tmp[1];
1576    union av_intfloat64 filter1;
1577    union av_intfloat64 filter2;
1578    union av_intfloat64 filter3;
1579    union av_intfloat64 filter4;
1580    mips_reg src1, dst1;
1581    DECLARE_VAR_ALL64;
1582    filter1.i = filter[1];
1583    filter2.i = filter[2];
1584    filter3.i = filter[3];
1585    filter4.i = filter[4];
1586
1587    /*
1588    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1589    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1590    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1591    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1592    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1593    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1594    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1595    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1596
1597    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1598    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1599    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1600    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1601    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1602    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1603    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1604    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1605    */
1606    __asm__ volatile (
1607        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1608        "li         %[tmp0],    0x07                                \n\t"
1609        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1610
1611        "1:                                                         \n\t"
1612        // 0 - 7
1613        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1614        PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
1615        PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
1616        // 8 - 15
1617        PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1618
1619        "addiu      %[h],       %[h],           -0x01               \n\t"
1620        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1621        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1622        "bnez       %[h],       1b                                  \n\t"
1623        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1624          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1625          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1626          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1627          [ftmp8]"=&f"(ftmp[8]),
1628          [tmp0]"=&r"(tmp[0]),
1629          RESTRICT_ASM_ALL64
1630          [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
1631          [h]"+&r"(h),
1632          [dst]"+&r"(dst),                  [src]"+&r"(src)
1633        : [ff_pw_64]"f"(ff_pw_64.f),
1634          [srcstride]"r"((mips_reg)srcstride),
1635          [dststride]"r"((mips_reg)dststride),
1636          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
1637          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
1638        : "memory"
1639    );
1640#else
1641    const uint8_t *filter = subpel_filters[mx - 1];
1642    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1643    int x, y;
1644
1645    for (y = 0; y < h; y++) {
1646        for (x = 0; x < 16; x++)
1647            dst[x] = FILTER_4TAP(src, filter, 1);
1648        dst += dststride;
1649        src += srcstride;
1650    }
1651#endif
1652}
1653
1654void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1655        ptrdiff_t srcstride, int h, int mx, int my)
1656{
1657#if 1
1658    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1659    double ftmp[9];
1660    uint32_t tmp[1];
1661    union av_intfloat64 filter1;
1662    union av_intfloat64 filter2;
1663    union av_intfloat64 filter3;
1664    union av_intfloat64 filter4;
1665    DECLARE_VAR_ALL64;
1666    filter1.i = filter[1];
1667    filter2.i = filter[2];
1668    filter3.i = filter[3];
1669    filter4.i = filter[4];
1670
1671
1672    /*
1673    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1674    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1675    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1676    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1677    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1678    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1679    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1680    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1681    */
1682    __asm__ volatile (
1683        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1684        "li         %[tmp0],    0x07                                \n\t"
1685        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1686
1687        "1:                                                         \n\t"
1688        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1689
1690        "addiu      %[h],       %[h],           -0x01               \n\t"
1691        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1692        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1693        "bnez       %[h],       1b                                  \n\t"
1694        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1695          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1696          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1697          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1698          [ftmp8]"=&f"(ftmp[8]),
1699          [tmp0]"=&r"(tmp[0]),
1700          RESTRICT_ASM_ALL64
1701          [h]"+&r"(h),
1702          [dst]"+&r"(dst),                  [src]"+&r"(src)
1703        : [ff_pw_64]"f"(ff_pw_64.f),
1704          [srcstride]"r"((mips_reg)srcstride),
1705          [dststride]"r"((mips_reg)dststride),
1706          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
1707          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
1708        : "memory"
1709    );
1710#else
1711    const uint8_t *filter = subpel_filters[mx - 1];
1712    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1713    int x, y;
1714
1715    for (y = 0; y < h; y++) {
1716        for (x = 0; x < 8; x++)
1717            dst[x] = FILTER_4TAP(src, filter, 1);
1718        dst += dststride;
1719        src += srcstride;
1720    }
1721#endif
1722}
1723
1724void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1725        ptrdiff_t srcstride, int h, int mx, int my)
1726{
1727#if 1
1728    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1729    double ftmp[6];
1730    uint32_t tmp[1];
1731    union av_intfloat64 filter1;
1732    union av_intfloat64 filter2;
1733    union av_intfloat64 filter3;
1734    union av_intfloat64 filter4;
1735    DECLARE_VAR_LOW32;
1736    filter1.i = filter[1];
1737    filter2.i = filter[2];
1738    filter3.i = filter[3];
1739    filter4.i = filter[4];
1740
1741    /*
1742    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1743    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1744    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1745    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1746    */
1747    __asm__ volatile (
1748        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1749        "li         %[tmp0],    0x07                                \n\t"
1750        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1751
1752        "1:                                                         \n\t"
1753        PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1754
1755        "addiu      %[h],       %[h],           -0x01               \n\t"
1756        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1757        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1758        "bnez       %[h],       1b                                  \n\t"
1759        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1760          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1761          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1762          [tmp0]"=&r"(tmp[0]),
1763          RESTRICT_ASM_LOW32
1764          [h]"+&r"(h),
1765          [dst]"+&r"(dst),                  [src]"+&r"(src)
1766        : [ff_pw_64]"f"(ff_pw_64.f),
1767          [srcstride]"r"((mips_reg)srcstride),
1768          [dststride]"r"((mips_reg)dststride),
1769          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
1770          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
1771        : "memory"
1772    );
1773#else
1774    const uint8_t *filter = subpel_filters[mx - 1];
1775    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1776    int x, y;
1777
1778    for (y = 0; y < h; y++) {
1779        for (x = 0; x < 4; x++)
1780            dst[x] = FILTER_4TAP(src, filter, 1);
1781        dst += dststride;
1782        src += srcstride;
1783    }
1784#endif
1785}
1786
1787void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1788        ptrdiff_t srcstride, int h, int mx, int my)
1789{
1790#if 1
1791    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1792    double ftmp[9];
1793    uint32_t tmp[1];
1794    mips_reg src1, dst1;
1795    union av_intfloat64 filter0;
1796    union av_intfloat64 filter1;
1797    union av_intfloat64 filter2;
1798    union av_intfloat64 filter3;
1799    union av_intfloat64 filter4;
1800    union av_intfloat64 filter5;
1801    DECLARE_VAR_ALL64;
1802    filter0.i = filter[0];
1803    filter1.i = filter[1];
1804    filter2.i = filter[2];
1805    filter3.i = filter[3];
1806    filter4.i = filter[4];
1807    filter5.i = filter[5];
1808
1809    /*
1810    dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1811    dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1812    dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1813    dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1814    dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1815    dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1816    dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1817    dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1818
1819    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1820    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1821    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1822    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1823    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1824    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1825    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1826    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1827    */
1828    __asm__ volatile (
1829        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1830        "li         %[tmp0],    0x07                                \n\t"
1831        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1832
1833        "1:                                                         \n\t"
1834        // 0 - 7
1835        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1836        PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
1837        PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
1838        // 8 - 15
1839        PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1840
1841        "addiu      %[h],       %[h],           -0x01               \n\t"
1842        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1843        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1844        "bnez       %[h],       1b                                  \n\t"
1845        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1846          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1847          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1848          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1849          [ftmp8]"=&f"(ftmp[8]),
1850          [tmp0]"=&r"(tmp[0]),
1851          RESTRICT_ASM_ALL64
1852          [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
1853          [h]"+&r"(h),
1854          [dst]"+&r"(dst),                  [src]"+&r"(src)
1855        : [ff_pw_64]"f"(ff_pw_64.f),
1856          [srcstride]"r"((mips_reg)srcstride),
1857          [dststride]"r"((mips_reg)dststride),
1858          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
1859          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
1860          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
1861        : "memory"
1862    );
1863#else
1864    const uint8_t *filter = subpel_filters[mx - 1];
1865    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1866    int x, y;
1867
1868    for (y = 0; y < h; y++) {
1869        for (x = 0; x < 16; x++)
1870            dst[x] = FILTER_6TAP(src, filter, 1);
1871        dst += dststride;
1872        src += srcstride;
1873    }
1874#endif
1875}
1876
1877void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1878        ptrdiff_t srcstride, int h, int mx, int my)
1879{
1880#if 1
1881    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1882    double ftmp[9];
1883    uint32_t tmp[1];
1884    union av_intfloat64 filter0;
1885    union av_intfloat64 filter1;
1886    union av_intfloat64 filter2;
1887    union av_intfloat64 filter3;
1888    union av_intfloat64 filter4;
1889    union av_intfloat64 filter5;
1890    DECLARE_VAR_ALL64;
1891    filter0.i = filter[0];
1892    filter1.i = filter[1];
1893    filter2.i = filter[2];
1894    filter3.i = filter[3];
1895    filter4.i = filter[4];
1896    filter5.i = filter[5];
1897
1898    /*
1899    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1900    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1901    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1902    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1903    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1904    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1905    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1906    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1907    */
1908    __asm__ volatile (
1909        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1910        "li         %[tmp0],    0x07                                \n\t"
1911        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1912
1913        "1:                                                         \n\t"
1914        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1915
1916        "addiu      %[h],       %[h],           -0x01               \n\t"
1917        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1918        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1919        "bnez       %[h],       1b                                  \n\t"
1920        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1921          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1922          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1923          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1924          [ftmp8]"=&f"(ftmp[8]),
1925          [tmp0]"=&r"(tmp[0]),
1926          RESTRICT_ASM_ALL64
1927          [h]"+&r"(h),
1928          [dst]"+&r"(dst),                  [src]"+&r"(src)
1929        : [ff_pw_64]"f"(ff_pw_64.f),
1930          [srcstride]"r"((mips_reg)srcstride),
1931          [dststride]"r"((mips_reg)dststride),
1932          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
1933          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
1934          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
1935        : "memory"
1936    );
1937#else
1938    const uint8_t *filter = subpel_filters[mx - 1];
1939    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1940    int x, y;
1941
1942    for (y = 0; y < h; y++) {
1943        for (x = 0; x < 8; x++)
1944            dst[x] = FILTER_6TAP(src, filter, 1);
1945        dst += dststride;
1946        src += srcstride;
1947    }
1948#endif
1949}
1950
1951void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1952        ptrdiff_t srcstride, int h, int mx, int my)
1953{
1954#if 1
1955    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1956    double ftmp[6];
1957    uint32_t tmp[1];
1958    union av_intfloat64 filter0;
1959    union av_intfloat64 filter1;
1960    union av_intfloat64 filter2;
1961    union av_intfloat64 filter3;
1962    union av_intfloat64 filter4;
1963    union av_intfloat64 filter5;
1964    DECLARE_VAR_LOW32;
1965    filter0.i = filter[0];
1966    filter1.i = filter[1];
1967    filter2.i = filter[2];
1968    filter3.i = filter[3];
1969    filter4.i = filter[4];
1970    filter5.i = filter[5];
1971
1972    /*
1973    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1974    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1975    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1976    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1977    */
1978    __asm__ volatile (
1979        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1980        "li         %[tmp0],    0x07                                \n\t"
1981        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1982
1983        "1:                                                         \n\t"
1984        PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1985
1986        "addiu      %[h],       %[h],           -0x01               \n\t"
1987        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1988        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1989        "bnez       %[h],       1b                                  \n\t"
1990        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1991          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1992          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1993          [tmp0]"=&r"(tmp[0]),
1994          RESTRICT_ASM_LOW32
1995          [h]"+&r"(h),
1996          [dst]"+&r"(dst),                  [src]"+&r"(src)
1997        : [ff_pw_64]"f"(ff_pw_64.f),
1998          [srcstride]"r"((mips_reg)srcstride),
1999          [dststride]"r"((mips_reg)dststride),
2000          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
2001          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
2002          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
2003        : "memory"
2004    );
2005#else
2006    const uint8_t *filter = subpel_filters[mx - 1];
2007    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2008    int x, y;
2009
2010    for (y = 0; y < h; y++) {
2011        for (x = 0; x < 4; x++)
2012            dst[x] = FILTER_6TAP(src, filter, 1);
2013        dst += dststride;
2014        src += srcstride;
2015    }
2016#endif
2017}
2018
2019void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2020        ptrdiff_t srcstride, int h, int mx, int my)
2021{
2022#if 1
2023    const uint64_t *filter = fourtap_subpel_filters[my - 1];
2024    double ftmp[9];
2025    uint32_t tmp[1];
2026    mips_reg src0, src1, dst0;
2027    union av_intfloat64 filter1;
2028    union av_intfloat64 filter2;
2029    union av_intfloat64 filter3;
2030    union av_intfloat64 filter4;
2031    DECLARE_VAR_ALL64;
2032    filter1.i = filter[1];
2033    filter2.i = filter[2];
2034    filter3.i = filter[3];
2035    filter4.i = filter[4];
2036
2037    /*
2038    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2039    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2040    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2041    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2042    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2043    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2044    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2045    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2046
2047    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2048    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2049    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2050    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2051    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2052    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2053    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2054    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2055    */
2056    __asm__ volatile (
2057        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2058        "li         %[tmp0],    0x07                                \n\t"
2059        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2060
2061        "1:                                                         \n\t"
2062        // 0 - 7
2063        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2064        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2065        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2066        // 8 - 15
2067        PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst], %[srcstride])
2068
2069        "addiu      %[h],       %[h],           -0x01               \n\t"
2070        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2071        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2072        "bnez       %[h],       1b                                  \n\t"
2073        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2074          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2075          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2076          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2077          [ftmp8]"=&f"(ftmp[8]),
2078          [tmp0]"=&r"(tmp[0]),
2079          RESTRICT_ASM_ALL64
2080          [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
2081          [src1]"=&r"(src1),
2082          [h]"+&r"(h),
2083          [dst]"+&r"(dst),                  [src]"+&r"(src)
2084        : [ff_pw_64]"f"(ff_pw_64.f),
2085          [srcstride]"r"((mips_reg)srcstride),
2086          [dststride]"r"((mips_reg)dststride),
2087          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
2088          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
2089        : "memory"
2090    );
2091#else
2092    const uint8_t *filter = subpel_filters[my - 1];
2093    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2094    int x, y;
2095
2096    for (y = 0; y < h; y++) {
2097        for (x = 0; x < 16; x++)
2098            dst[x] = FILTER_4TAP(src, filter, srcstride);
2099        dst += dststride;
2100        src += srcstride;
2101    }
2102#endif
2103}
2104
2105void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2106        ptrdiff_t srcstride, int h, int mx, int my)
2107{
2108#if 1
2109    const uint64_t *filter = fourtap_subpel_filters[my - 1];
2110    double ftmp[9];
2111    uint32_t tmp[1];
2112    mips_reg src1;
2113    union av_intfloat64 filter1;
2114    union av_intfloat64 filter2;
2115    union av_intfloat64 filter3;
2116    union av_intfloat64 filter4;
2117    DECLARE_VAR_ALL64;
2118    filter1.i = filter[1];
2119    filter2.i = filter[2];
2120    filter3.i = filter[3];
2121    filter4.i = filter[4];
2122
2123    /*
2124    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2125    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2126    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2127    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2128    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2129    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2130    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2131    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2132    */
2133    __asm__ volatile (
2134        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2135        "li         %[tmp0],    0x07                                \n\t"
2136        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2137
2138        "1:                                                         \n\t"
2139        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2140
2141        "addiu      %[h],       %[h],           -0x01               \n\t"
2142        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2143        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2144        "bnez       %[h],       1b                                  \n\t"
2145        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2146          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2147          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2148          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2149          [ftmp8]"=&f"(ftmp[8]),
2150          [tmp0]"=&r"(tmp[0]),
2151          RESTRICT_ASM_ALL64
2152          [src1]"=&r"(src1),
2153          [h]"+&r"(h),
2154          [dst]"+&r"(dst),                  [src]"+&r"(src)
2155        : [ff_pw_64]"f"(ff_pw_64.f),
2156          [srcstride]"r"((mips_reg)srcstride),
2157          [dststride]"r"((mips_reg)dststride),
2158          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
2159          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
2160        : "memory"
2161    );
2162#else
2163    const uint8_t *filter = subpel_filters[my - 1];
2164    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2165    int x, y;
2166
2167    for (y = 0; y < h; y++) {
2168        for (x = 0; x < 8; x++)
2169            dst[x] = FILTER_4TAP(src, filter, srcstride);
2170        dst += dststride;
2171        src += srcstride;
2172    }
2173#endif
2174}
2175
2176void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2177        ptrdiff_t srcstride, int h, int mx, int my)
2178{
2179#if 1
2180    const uint64_t *filter = fourtap_subpel_filters[my - 1];
2181    double ftmp[6];
2182    uint32_t tmp[1];
2183    mips_reg src1;
2184    union av_intfloat64 filter1;
2185    union av_intfloat64 filter2;
2186    union av_intfloat64 filter3;
2187    union av_intfloat64 filter4;
2188    DECLARE_VAR_LOW32;
2189    filter1.i = filter[1];
2190    filter2.i = filter[2];
2191    filter3.i = filter[3];
2192    filter4.i = filter[4];
2193
2194    /*
2195    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2196    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2197    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2198    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2199    */
2200    __asm__ volatile (
2201        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2202        "li         %[tmp0],    0x07                                \n\t"
2203        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2204
2205        "1:                                                         \n\t"
2206        PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2207
2208        "addiu      %[h],       %[h],           -0x01               \n\t"
2209        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2210        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2211        "bnez       %[h],       1b                                  \n\t"
2212        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2213          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2214          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2215          [tmp0]"=&r"(tmp[0]),
2216          RESTRICT_ASM_LOW32
2217          [src1]"=&r"(src1),
2218          [h]"+&r"(h),
2219          [dst]"+&r"(dst),                  [src]"+&r"(src)
2220        : [ff_pw_64]"f"(ff_pw_64.f),
2221          [srcstride]"r"((mips_reg)srcstride),
2222          [dststride]"r"((mips_reg)dststride),
2223          [filter1]"f"(filter1.f),          [filter2]"f"(filter2.f),
2224          [filter3]"f"(filter3.f),          [filter4]"f"(filter4.f)
2225        : "memory"
2226    );
2227#else
2228    const uint8_t *filter = subpel_filters[my - 1];
2229    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2230    int x, y;
2231
2232    for (y = 0; y < h; y++) {
2233        for (x = 0; x < 4; x++)
2234            dst[x] = FILTER_4TAP(src, filter, srcstride);
2235        dst += dststride;
2236        src += srcstride;
2237    }
2238#endif
2239}
2240
2241void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2242        ptrdiff_t srcstride, int h, int mx, int my)
2243{
2244#if 1
2245    const uint64_t *filter = fourtap_subpel_filters[my - 1];
2246    double ftmp[9];
2247    uint32_t tmp[1];
2248    mips_reg src0, src1, dst0;
2249    union av_intfloat64 filter0;
2250    union av_intfloat64 filter1;
2251    union av_intfloat64 filter2;
2252    union av_intfloat64 filter3;
2253    union av_intfloat64 filter4;
2254    union av_intfloat64 filter5;
2255    DECLARE_VAR_ALL64;
2256    filter0.i = filter[0];
2257    filter1.i = filter[1];
2258    filter2.i = filter[2];
2259    filter3.i = filter[3];
2260    filter4.i = filter[4];
2261    filter5.i = filter[5];
2262
2263    /*
2264    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2265    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2266    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2267    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2268    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2269    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2270    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2271    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2272
2273    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2274    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2275    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2276    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2277    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2278    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2279    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2280    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2281    */
2282    __asm__ volatile (
2283        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2284        "li         %[tmp0],    0x07                                \n\t"
2285        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2286
2287        "1:                                                         \n\t"
2288        // 0 - 7
2289        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2290        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2291        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2292        // 8 - 15
2293        PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2294
2295        "addiu      %[h],       %[h],           -0x01               \n\t"
2296        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2297        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2298        "bnez       %[h],       1b                                  \n\t"
2299        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2300          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2301          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2302          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2303          [ftmp8]"=&f"(ftmp[8]),
2304          [tmp0]"=&r"(tmp[0]),
2305          RESTRICT_ASM_ALL64
2306          [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
2307          [src1]"=&r"(src1),
2308          [h]"+&r"(h),
2309          [dst]"+&r"(dst),                  [src]"+&r"(src)
2310        : [ff_pw_64]"f"(ff_pw_64.f),
2311          [srcstride]"r"((mips_reg)srcstride),
2312          [dststride]"r"((mips_reg)dststride),
2313          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
2314          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
2315          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
2316        : "memory"
2317    );
2318#else
2319    const uint8_t *filter = subpel_filters[my - 1];
2320    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2321    int x, y;
2322
2323    for (y = 0; y < h; y++) {
2324        for (x = 0; x < 16; x++)
2325            dst[x] = FILTER_6TAP(src, filter, srcstride);
2326        dst += dststride;
2327        src += srcstride;
2328    }
2329#endif
2330}
2331
2332void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2333        ptrdiff_t srcstride, int h, int mx, int my)
2334{
2335#if 1
2336    const uint64_t *filter = fourtap_subpel_filters[my - 1];
2337    double ftmp[9];
2338    uint32_t tmp[1];
2339    mips_reg src1;
2340    union av_intfloat64 filter0;
2341    union av_intfloat64 filter1;
2342    union av_intfloat64 filter2;
2343    union av_intfloat64 filter3;
2344    union av_intfloat64 filter4;
2345    union av_intfloat64 filter5;
2346    DECLARE_VAR_ALL64;
2347    filter0.i = filter[0];
2348    filter1.i = filter[1];
2349    filter2.i = filter[2];
2350    filter3.i = filter[3];
2351    filter4.i = filter[4];
2352    filter5.i = filter[5];
2353
2354    /*
2355    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2356    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2357    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2358    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2359    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2360    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2361    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2362    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2363    */
2364    __asm__ volatile (
2365        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2366        "li         %[tmp0],    0x07                                \n\t"
2367        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2368
2369        "1:                                                         \n\t"
2370        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2371
2372        "addiu      %[h],       %[h],           -0x01               \n\t"
2373        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2374        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2375        "bnez       %[h],       1b                                  \n\t"
2376        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2377          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2378          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2379          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2380          [ftmp8]"=&f"(ftmp[8]),
2381          [tmp0]"=&r"(tmp[0]),
2382          RESTRICT_ASM_ALL64
2383          [src1]"=&r"(src1),
2384          [h]"+&r"(h),
2385          [dst]"+&r"(dst),                  [src]"+&r"(src)
2386        : [ff_pw_64]"f"(ff_pw_64.f),
2387          [srcstride]"r"((mips_reg)srcstride),
2388          [dststride]"r"((mips_reg)dststride),
2389          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
2390          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
2391          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
2392        : "memory"
2393    );
2394#else
2395    const uint8_t *filter = subpel_filters[my - 1];
2396    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2397    int x, y;
2398
2399    for (y = 0; y < h; y++) {
2400        for (x = 0; x < 8; x++)
2401            dst[x] = FILTER_6TAP(src, filter, srcstride);
2402        dst += dststride;
2403        src += srcstride;
2404    }
2405#endif
2406}
2407
2408void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2409        ptrdiff_t srcstride, int h, int mx, int my)
2410{
2411#if 1
2412    const uint64_t *filter = fourtap_subpel_filters[my - 1];
2413    double ftmp[6];
2414    uint32_t tmp[1];
2415    mips_reg src1;
2416    union av_intfloat64 filter0;
2417    union av_intfloat64 filter1;
2418    union av_intfloat64 filter2;
2419    union av_intfloat64 filter3;
2420    union av_intfloat64 filter4;
2421    union av_intfloat64 filter5;
2422    DECLARE_VAR_LOW32;
2423    filter0.i = filter[0];
2424    filter1.i = filter[1];
2425    filter2.i = filter[2];
2426    filter3.i = filter[3];
2427    filter4.i = filter[4];
2428    filter5.i = filter[5];
2429
2430    /*
2431    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2432    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2433    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2434    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2435    */
2436    __asm__ volatile (
2437        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2438        "li         %[tmp0],    0x07                                \n\t"
2439        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2440
2441        "1:                                                         \n\t"
2442        PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2443
2444        "addiu      %[h],       %[h],           -0x01               \n\t"
2445        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2446        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2447        "bnez       %[h],       1b                                  \n\t"
2448        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2449          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2450          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2451          [tmp0]"=&r"(tmp[0]),
2452          RESTRICT_ASM_LOW32
2453          [src1]"=&r"(src1),
2454          [h]"+&r"(h),
2455          [dst]"+&r"(dst),                  [src]"+&r"(src)
2456        : [ff_pw_64]"f"(ff_pw_64.f),
2457          [srcstride]"r"((mips_reg)srcstride),
2458          [dststride]"r"((mips_reg)dststride),
2459          [filter0]"f"(filter0.f),          [filter1]"f"(filter1.f),
2460          [filter2]"f"(filter2.f),          [filter3]"f"(filter3.f),
2461          [filter4]"f"(filter4.f),          [filter5]"f"(filter5.f)
2462        : "memory"
2463    );
2464#else
2465    const uint8_t *filter = subpel_filters[my - 1];
2466    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2467    int x, y;
2468
2469    for (y = 0; y < h; y++) {
2470        for (x = 0; x < 4; x++)
2471            dst[x] = FILTER_6TAP(src, filter, srcstride);
2472        dst += dststride;
2473        src += srcstride;
2474    }
2475#endif
2476}
2477
2478void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2479        ptrdiff_t srcstride, int h, int mx, int my)
2480{
2481#if 1
2482    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2483    uint8_t *tmp = tmp_array;
2484
2485    src -= srcstride;
2486    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2487    tmp = tmp_array + 16;
2488    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2489#else
2490    const uint8_t *filter = subpel_filters[mx - 1];
2491    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2492    int x, y;
2493    uint8_t tmp_array[560];
2494    uint8_t *tmp = tmp_array;
2495
2496    src -= srcstride;
2497
2498    for (y = 0; y < h + 3; y++) {
2499        for (x = 0; x < 16; x++)
2500            tmp[x] = FILTER_4TAP(src, filter, 1);
2501        tmp += 16;
2502        src += srcstride;
2503    }
2504
2505    tmp    = tmp_array + 16;
2506    filter = subpel_filters[my - 1];
2507
2508    for (y = 0; y < h; y++) {
2509        for (x = 0; x < 16; x++)
2510            dst[x] = FILTER_4TAP(tmp, filter, 16);
2511        dst += dststride;
2512        tmp += 16;
2513    }
2514#endif
2515}
2516
2517void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2518        ptrdiff_t srcstride, int h, int mx, int my)
2519{
2520#if 1
2521    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2522    uint8_t *tmp = tmp_array;
2523
2524    src -= srcstride;
2525    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2526    tmp = tmp_array + 8;
2527    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2528#else
2529    const uint8_t *filter = subpel_filters[mx - 1];
2530    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2531    int x, y;
2532    uint8_t tmp_array[152];
2533    uint8_t *tmp = tmp_array;
2534
2535    src -= srcstride;
2536
2537    for (y = 0; y < h + 3; y++) {
2538        for (x = 0; x < 8; x++)
2539            tmp[x] = FILTER_4TAP(src, filter, 1);
2540        tmp += 8;
2541        src += srcstride;
2542    }
2543
2544    tmp    = tmp_array + 8;
2545    filter = subpel_filters[my - 1];
2546
2547    for (y = 0; y < h; y++) {
2548        for (x = 0; x < 8; x++)
2549            dst[x] = FILTER_4TAP(tmp, filter, 8);
2550        dst += dststride;
2551        tmp += 8;
2552    }
2553#endif
2554}
2555
2556void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2557        ptrdiff_t srcstride, int h, int mx, int my)
2558{
2559#if 1
2560    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2561    uint8_t *tmp = tmp_array;
2562
2563    src -= srcstride;
2564    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2565    tmp = tmp_array + 4;
2566    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2567#else
2568    const uint8_t *filter = subpel_filters[mx - 1];
2569    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2570    int x, y;
2571    uint8_t tmp_array[44];
2572    uint8_t *tmp = tmp_array;
2573
2574    src -= srcstride;
2575
2576    for (y = 0; y < h + 3; y++) {
2577        for (x = 0; x < 4; x++)
2578            tmp[x] = FILTER_4TAP(src, filter, 1);
2579        tmp += 4;
2580        src += srcstride;
2581    }
2582    tmp    = tmp_array + 4;
2583    filter = subpel_filters[my - 1];
2584
2585    for (y = 0; y < h; y++) {
2586        for (x = 0; x < 4; x++)
2587            dst[x] = FILTER_4TAP(tmp, filter, 4);
2588        dst += dststride;
2589        tmp += 4;
2590    }
2591#endif
2592}
2593
2594void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2595        ptrdiff_t srcstride, int h, int mx, int my)
2596{
2597#if 1
2598    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2599    uint8_t *tmp = tmp_array;
2600
2601    src -= 2 * srcstride;
2602    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2603    tmp    = tmp_array + 32;
2604    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2605#else
2606    const uint8_t *filter = subpel_filters[mx - 1];
2607    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2608    int x, y;
2609    uint8_t tmp_array[592];
2610    uint8_t *tmp = tmp_array;
2611
2612    src -= 2 * srcstride;
2613
2614    for (y = 0; y < h + 5; y++) {
2615        for (x = 0; x < 16; x++)
2616            tmp[x] = FILTER_4TAP(src, filter, 1);
2617        tmp += 16;
2618        src += srcstride;
2619    }
2620
2621    tmp    = tmp_array + 32;
2622    filter = subpel_filters[my - 1];
2623
2624    for (y = 0; y < h; y++) {
2625        for (x = 0; x < 16; x++)
2626            dst[x] = FILTER_6TAP(tmp, filter, 16);
2627        dst += dststride;
2628        tmp += 16;
2629    }
2630#endif
2631}
2632
2633void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2634        ptrdiff_t srcstride, int h, int mx, int my)
2635{
2636#if 1
2637    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2638    uint8_t *tmp = tmp_array;
2639
2640    src -= 2 * srcstride;
2641    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2642    tmp    = tmp_array + 16;
2643    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2644#else
2645    const uint8_t *filter = subpel_filters[mx - 1];
2646    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2647    int x, y;
2648    uint8_t tmp_array[168];
2649    uint8_t *tmp = tmp_array;
2650
2651    src -= 2 * srcstride;
2652
2653    for (y = 0; y < h + 5; y++) {
2654        for (x = 0; x < 8; x++)
2655            tmp[x] = FILTER_4TAP(src, filter, 1);
2656        tmp += 8;
2657        src += srcstride;
2658    }
2659
2660    tmp    = tmp_array + 16;
2661    filter = subpel_filters[my - 1];
2662
2663    for (y = 0; y < h; y++) {
2664        for (x = 0; x < 8; x++)
2665            dst[x] = FILTER_6TAP(tmp, filter, 8);
2666        dst += dststride;
2667        tmp += 8;
2668    }
2669#endif
2670}
2671
2672void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2673        ptrdiff_t srcstride, int h, int mx, int my)
2674{
2675#if 1
2676    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2677    uint8_t *tmp = tmp_array;
2678
2679    src -= 2 * srcstride;
2680    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2681    tmp    = tmp_array + 8;
2682    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2683#else
2684    const uint8_t *filter = subpel_filters[mx - 1];
2685    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2686    int x, y;
2687    uint8_t tmp_array[52];
2688    uint8_t *tmp = tmp_array;
2689
2690    src -= 2 * srcstride;
2691
2692    for (y = 0; y < h + 5; y++) {
2693        for (x = 0; x < 4; x++)
2694            tmp[x] = FILTER_4TAP(src, filter, 1);
2695        tmp += 4;
2696        src += srcstride;
2697    }
2698
2699    tmp    = tmp_array + 8;
2700    filter = subpel_filters[my - 1];
2701
2702    for (y = 0; y < h; y++) {
2703        for (x = 0; x < 4; x++)
2704            dst[x] = FILTER_6TAP(tmp, filter, 4);
2705        dst += dststride;
2706        tmp += 4;
2707    }
2708#endif
2709}
2710
2711void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2712        ptrdiff_t srcstride, int h, int mx, int my)
2713{
2714#if 1
2715    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2716    uint8_t *tmp = tmp_array;
2717
2718    src -= srcstride;
2719    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2720    tmp    = tmp_array + 16;
2721    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2722#else
2723    const uint8_t *filter = subpel_filters[mx - 1];
2724    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2725    int x, y;
2726    uint8_t tmp_array[560];
2727    uint8_t *tmp = tmp_array;
2728
2729    src -= srcstride;
2730
2731    for (y = 0; y < h + 3; y++) {
2732        for (x = 0; x < 16; x++)
2733            tmp[x] = FILTER_6TAP(src, filter, 1);
2734        tmp += 16;
2735        src += srcstride;
2736    }
2737
2738    tmp    = tmp_array + 16;
2739    filter = subpel_filters[my - 1];
2740
2741    for (y = 0; y < h; y++) {
2742        for (x = 0; x < 16; x++)
2743            dst[x] = FILTER_4TAP(tmp, filter, 16);
2744        dst += dststride;
2745        tmp += 16;
2746    }
2747#endif
2748}
2749
2750void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2751        ptrdiff_t srcstride, int h, int mx, int my)
2752{
2753#if 1
2754    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2755    uint8_t *tmp = tmp_array;
2756
2757    src -= srcstride;
2758    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2759    tmp    = tmp_array + 8;
2760    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2761#else
2762    const uint8_t *filter = subpel_filters[mx - 1];
2763    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2764    int x, y;
2765    uint8_t tmp_array[152];
2766    uint8_t *tmp = tmp_array;
2767
2768    src -= srcstride;
2769
2770    for (y = 0; y < h + 3; y++) {
2771        for (x = 0; x < 8; x++)
2772            tmp[x] = FILTER_6TAP(src, filter, 1);
2773        tmp += 8;
2774        src += srcstride;
2775    }
2776
2777    tmp    = tmp_array + 8;
2778    filter = subpel_filters[my - 1];
2779
2780    for (y = 0; y < h; y++) {
2781        for (x = 0; x < 8; x++)
2782            dst[x] = FILTER_4TAP(tmp, filter, 8);
2783        dst += dststride;
2784        tmp += 8;
2785    }
2786#endif
2787}
2788
2789void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2790        ptrdiff_t srcstride, int h, int mx, int my)
2791{
2792#if 1
2793    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2794    uint8_t *tmp = tmp_array;
2795
2796    src -= srcstride;
2797    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2798    tmp    = tmp_array + 4;
2799    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2800#else
2801    const uint8_t *filter = subpel_filters[mx - 1];
2802    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2803    int x, y;
2804    uint8_t tmp_array[44];
2805    uint8_t *tmp = tmp_array;
2806
2807    src -= srcstride;
2808
2809    for (y = 0; y < h + 3; y++) {
2810        for (x = 0; x < 4; x++)
2811            tmp[x] = FILTER_6TAP(src, filter, 1);
2812        tmp += 4;
2813        src += srcstride;
2814    }
2815
2816    tmp    = tmp_array + 4;
2817    filter = subpel_filters[my - 1];
2818
2819    for (y = 0; y < h; y++) {
2820        for (x = 0; x < 4; x++)
2821            dst[x] = FILTER_4TAP(tmp, filter, 4);
2822        dst += dststride;
2823        tmp += 4;
2824    }
2825#endif
2826}
2827
2828void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2829        ptrdiff_t srcstride, int h, int mx, int my)
2830{
2831#if 1
2832    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2833    uint8_t *tmp = tmp_array;
2834
2835    src -= 2 * srcstride;
2836    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2837    tmp    = tmp_array + 32;
2838    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2839#else
2840    const uint8_t *filter = subpel_filters[mx - 1];
2841    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2842    int x, y;
2843    uint8_t tmp_array[592];
2844    uint8_t *tmp = tmp_array;
2845
2846    src -= 2 * srcstride;
2847
2848    for (y = 0; y < h + 5; y++) {
2849        for (x = 0; x < 16; x++)
2850            tmp[x] = FILTER_6TAP(src, filter, 1);
2851        tmp += 16;
2852        src += srcstride;
2853    }
2854
2855    tmp    = tmp_array + 32;
2856    filter = subpel_filters[my - 1];
2857
2858    for (y = 0; y < h; y++) {
2859        for (x = 0; x < 16; x++)
2860            dst[x] = FILTER_6TAP(tmp, filter, 16);
2861        dst += dststride;
2862        tmp += 16;
2863    }
2864#endif
2865}
2866
2867void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2868        ptrdiff_t srcstride, int h, int mx, int my)
2869{
2870#if 1
2871    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2872    uint8_t *tmp = tmp_array;
2873
2874    src -= 2 * srcstride;
2875    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2876    tmp    = tmp_array + 16;
2877    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2878#else
2879    const uint8_t *filter = subpel_filters[mx - 1];
2880    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2881    int x, y;
2882    uint8_t tmp_array[168];
2883    uint8_t *tmp = tmp_array;
2884
2885    src -= 2 * srcstride;
2886
2887    for (y = 0; y < h + 5; y++) {
2888        for (x = 0; x < 8; x++)
2889            tmp[x] = FILTER_6TAP(src, filter, 1);
2890        tmp += 8;
2891        src += srcstride;
2892    }
2893
2894    tmp    = tmp_array + 16;
2895    filter = subpel_filters[my - 1];
2896
2897    for (y = 0; y < h; y++) {
2898        for (x = 0; x < 8; x++)
2899            dst[x] = FILTER_6TAP(tmp, filter, 8);
2900        dst += dststride;
2901        tmp += 8;
2902    }
2903#endif
2904}
2905
2906void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2907        ptrdiff_t srcstride, int h, int mx, int my)
2908{
2909#if 1
2910    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2911    uint8_t *tmp = tmp_array;
2912
2913    src -= 2 * srcstride;
2914    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2915    tmp    = tmp_array + 8;
2916    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2917#else
2918    const uint8_t *filter = subpel_filters[mx - 1];
2919    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2920    int x, y;
2921    uint8_t tmp_array[52];
2922    uint8_t *tmp = tmp_array;
2923
2924    src -= 2 * srcstride;
2925
2926    for (y = 0; y < h + 5; y++) {
2927        for (x = 0; x < 4; x++)
2928            tmp[x] = FILTER_6TAP(src, filter, 1);
2929        tmp += 4;
2930        src += srcstride;
2931    }
2932
2933    tmp    = tmp_array + 8;
2934    filter = subpel_filters[my - 1];
2935
2936    for (y = 0; y < h; y++) {
2937        for (x = 0; x < 4; x++)
2938            dst[x] = FILTER_6TAP(tmp, filter, 4);
2939        dst += dststride;
2940        tmp += 4;
2941    }
2942#endif
2943}
2944
2945void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2946        ptrdiff_t sstride, int h, int mx, int my)
2947{
2948#if 1
2949    union mmi_intfloat64 a, b;
2950    double ftmp[7];
2951    uint32_t tmp[1];
2952    mips_reg dst0, src0;
2953    DECLARE_VAR_ALL64;
2954    a.i = 8 - mx;
2955    b.i = mx;
2956
2957    /*
2958    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2959    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2960    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2961    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2962    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2963    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2964    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2965    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2966
2967    dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2968    dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2969    dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2970    dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2971    dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2972    dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2973    dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2974    dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2975    */
2976    __asm__ volatile (
2977        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2978        "li         %[tmp0],    0x03                                \n\t"
2979        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2980        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
2981        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
2982
2983        "1:                                                         \n\t"
2984        // 0 - 7
2985        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2986        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2987        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2988        // 8 - 15
2989        PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2990
2991        "addiu      %[h],       %[h],           -0x01               \n\t"
2992        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
2993        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
2994        "bnez       %[h],       1b                                  \n\t"
2995        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2996          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2997          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2998          [ftmp6]"=&f"(ftmp[6]),
2999          [tmp0]"=&r"(tmp[0]),
3000          RESTRICT_ASM_ALL64
3001          [dst0]"=&r"(dst0),            [src0]"=&r"(src0),
3002          [h]"+&r"(h),
3003          [dst]"+&r"(dst),              [src]"+&r"(src),
3004          [a]"+&f"(a.f),                [b]"+&f"(b.f)
3005        : [sstride]"r"((mips_reg)sstride),
3006          [dstride]"r"((mips_reg)dstride),
3007          [ff_pw_4]"f"(ff_pw_4.f)
3008        : "memory"
3009    );
3010#else
3011    int a = 8 - mx, b = mx;
3012    int x, y;
3013
3014    for (y = 0; y < h; y++) {
3015        for (x = 0; x < 16; x++)
3016            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3017        dst += dstride;
3018        src += sstride;
3019    }
3020#endif
3021}
3022
3023void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3024        ptrdiff_t sstride, int h, int mx, int my)
3025{
3026#if 1
3027    union mmi_intfloat64 c, d;
3028    double ftmp[7];
3029    uint32_t tmp[1];
3030    mips_reg src0, src1, dst0;
3031    DECLARE_VAR_ALL64;
3032    c.i = 8 - my;
3033    d.i = my;
3034
3035    /*
3036    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
3037    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3038    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3039    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3040    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3041    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3042    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3043    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3044    */
3045    __asm__ volatile (
3046        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3047        "li         %[tmp0],    0x03                                \n\t"
3048        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3049        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3050        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3051
3052        "1:                                                         \n\t"
3053        // 0 - 7
3054        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3055        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
3056        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
3057        // 8 - 15
3058        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
3059
3060        "addiu      %[h],       %[h],           -0x01               \n\t"
3061        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3062        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3063        "bnez       %[h],       1b                                  \n\t"
3064        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3065          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3066          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3067          [ftmp6]"=&f"(ftmp[6]),
3068          [tmp0]"=&r"(tmp[0]),
3069          RESTRICT_ASM_ALL64
3070          [src0]"=&r"(src0),            [dst0]"=&r"(dst0),
3071          [src1]"=&r"(src1),
3072          [h]"+&r"(h),
3073          [dst]"+&r"(dst),              [src]"+&r"(src),
3074          [c]"+&f"(c.f),                [d]"+&f"(d.f)
3075        : [sstride]"r"((mips_reg)sstride),
3076          [dstride]"r"((mips_reg)dstride),
3077          [ff_pw_4]"f"(ff_pw_4.f)
3078        : "memory"
3079    );
3080#else
3081    int c = 8 - my, d = my;
3082    int x, y;
3083
3084    for (y = 0; y < h; y++) {
3085        for (x = 0; x < 16; x++)
3086            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3087        dst += dstride;
3088        src += sstride;
3089    }
3090#endif
3091}
3092
3093void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3094        ptrdiff_t sstride, int h, int mx, int my)
3095{
3096#if 1
3097    DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
3098    uint8_t *tmp = tmp_array;
3099
3100    ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
3101    ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
3102#else
3103    int a = 8 - mx, b = mx;
3104    int c = 8 - my, d = my;
3105    int x, y;
3106    uint8_t tmp_array[528];
3107    uint8_t *tmp = tmp_array;
3108
3109    for (y = 0; y < h + 1; y++) {
3110        for (x = 0; x < 16; x++)
3111            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3112        tmp += 16;
3113        src += sstride;
3114    }
3115
3116    tmp = tmp_array;
3117
3118    for (y = 0; y < h; y++) {
3119        for (x = 0; x < 16; x++)
3120            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3121        dst += dstride;
3122        tmp += 16;
3123    }
3124#endif
3125}
3126
3127void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3128        ptrdiff_t sstride, int h, int mx, int my)
3129{
3130#if 1
3131    union mmi_intfloat64 a, b;
3132    double ftmp[7];
3133    uint32_t tmp[1];
3134    DECLARE_VAR_ALL64;
3135    a.i = 8 - mx;
3136    b.i = mx;
3137
3138    /*
3139    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3140    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3141    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3142    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3143    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3144    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3145    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3146    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3147    */
3148    __asm__ volatile (
3149        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3150        "li         %[tmp0],    0x03                                \n\t"
3151        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3152        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
3153        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
3154
3155        "1:                                                         \n\t"
3156        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3157
3158        "addiu      %[h],       %[h],           -0x01               \n\t"
3159        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3160        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3161        "bnez       %[h],       1b                                  \n\t"
3162        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3163          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3164          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3165          [ftmp6]"=&f"(ftmp[6]),
3166          [tmp0]"=&r"(tmp[0]),
3167          RESTRICT_ASM_ALL64
3168          [h]"+&r"(h),
3169          [dst]"+&r"(dst),              [src]"+&r"(src),
3170          [a]"+&f"(a.f),                [b]"+&f"(b.f)
3171        : [sstride]"r"((mips_reg)sstride),
3172          [dstride]"r"((mips_reg)dstride),
3173          [ff_pw_4]"f"(ff_pw_4.f)
3174        : "memory"
3175    );
3176#else
3177    int a = 8 - mx, b = mx;
3178    int x, y;
3179
3180    for (y = 0; y < h; y++) {
3181        for (x = 0; x < 8; x++)
3182            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3183        dst += dstride;
3184        src += sstride;
3185    }
3186#endif
3187}
3188
3189void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3190        ptrdiff_t sstride, int h, int mx, int my)
3191{
3192#if 1
3193    union mmi_intfloat64 c, d;
3194    double ftmp[7];
3195    uint32_t tmp[1];
3196    mips_reg src1;
3197    DECLARE_VAR_ALL64;
3198    c.i = 8 - my;
3199    d.i = my;
3200
3201    /*
3202    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
3203    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3204    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3205    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3206    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3207    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3208    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3209    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3210    */
3211    __asm__ volatile (
3212        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3213        "li         %[tmp0],    0x03                                \n\t"
3214        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3215        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3216        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3217
3218        "1:                                                         \n\t"
3219        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3220
3221        "addiu      %[h],       %[h],           -0x01               \n\t"
3222        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3223        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3224        "bnez       %[h],       1b                                  \n\t"
3225        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3226          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3227          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3228          [ftmp6]"=&f"(ftmp[6]),
3229          [tmp0]"=&r"(tmp[0]),
3230          RESTRICT_ASM_ALL64
3231          [src1]"=&r"(src1),
3232          [h]"+&r"(h),
3233          [dst]"+&r"(dst),              [src]"+&r"(src),
3234          [c]"+&f"(c.f),                [d]"+&f"(d.f)
3235        : [sstride]"r"((mips_reg)sstride),
3236          [dstride]"r"((mips_reg)dstride),
3237          [ff_pw_4]"f"(ff_pw_4.f)
3238        : "memory"
3239    );
3240#else
3241    int c = 8 - my, d = my;
3242    int x, y;
3243
3244    for (y = 0; y < h; y++) {
3245        for (x = 0; x < 8; x++)
3246            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3247        dst += dstride;
3248        src += sstride;
3249    }
3250#endif
3251}
3252
3253void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3254        ptrdiff_t sstride, int h, int mx, int my)
3255{
3256#if 1
3257    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3258    uint8_t *tmp = tmp_array;
3259
3260    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3261    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3262#else
3263    int a = 8 - mx, b = mx;
3264    int c = 8 - my, d = my;
3265    int x, y;
3266    uint8_t tmp_array[136];
3267    uint8_t *tmp = tmp_array;
3268
3269    for (y = 0; y < h + 1; y++) {
3270        for (x = 0; x < 8; x++)
3271            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3272        tmp += 8;
3273        src += sstride;
3274    }
3275
3276    tmp = tmp_array;
3277
3278    for (y = 0; y < h; y++) {
3279        for (x = 0; x < 8; x++)
3280            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3281        dst += dstride;
3282        tmp += 8;
3283    }
3284#endif
3285}
3286
3287void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3288        ptrdiff_t sstride, int h, int mx, int my)
3289{
3290#if 1
3291    union mmi_intfloat64 a, b;
3292    double ftmp[5];
3293    uint32_t tmp[1];
3294    DECLARE_VAR_LOW32;
3295    DECLARE_VAR_ALL64;
3296    a.i = 8 - mx;
3297    b.i = mx;
3298
3299    /*
3300    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3301    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3302    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3303    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3304    */
3305    __asm__ volatile (
3306        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3307        "li         %[tmp0],    0x03                                \n\t"
3308        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3309        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
3310        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
3311
3312        "1:                                                         \n\t"
3313        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3314
3315        "addiu      %[h],       %[h],           -0x01               \n\t"
3316        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3317        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3318        "bnez       %[h],       1b                                  \n\t"
3319        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3320          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3321          [ftmp4]"=&f"(ftmp[4]),
3322          [tmp0]"=&r"(tmp[0]),
3323          RESTRICT_ASM_LOW32
3324          RESTRICT_ASM_ALL64
3325          [h]"+&r"(h),
3326          [dst]"+&r"(dst),              [src]"+&r"(src),
3327          [a]"+&f"(a.f),                [b]"+&f"(b.f)
3328        : [sstride]"r"((mips_reg)sstride),
3329          [dstride]"r"((mips_reg)dstride),
3330          [ff_pw_4]"f"(ff_pw_4.f)
3331        : "memory"
3332    );
3333#else
3334    int a = 8 - mx, b = mx;
3335    int x, y;
3336
3337    for (y = 0; y < h; y++) {
3338        for (x = 0; x < 4; x++)
3339            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3340        dst += dstride;
3341        src += sstride;
3342    }
3343#endif
3344}
3345
3346void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3347        ptrdiff_t sstride, int h, int mx, int my)
3348{
3349#if 1
3350    union mmi_intfloat64 c, d;
3351    double ftmp[7];
3352    uint32_t tmp[1];
3353    mips_reg src1;
3354    DECLARE_VAR_LOW32;
3355    DECLARE_VAR_ALL64;
3356    c.i = 8 - my;
3357    d.i = my;
3358
3359    /*
3360    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
3361    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3362    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3363    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3364    */
3365    __asm__ volatile (
3366        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3367        "li         %[tmp0],    0x03                                \n\t"
3368        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3369        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3370        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3371
3372        "1:                                                         \n\t"
3373        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3374
3375        "addiu      %[h],       %[h],           -0x01               \n\t"
3376        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3377        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3378        "bnez       %[h],       1b                                  \n\t"
3379        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3380          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3381          [ftmp4]"=&f"(ftmp[4]),
3382          [tmp0]"=&r"(tmp[0]),
3383          RESTRICT_ASM_LOW32
3384          RESTRICT_ASM_ALL64
3385          [src1]"=&r"(src1),
3386          [h]"+&r"(h),
3387          [dst]"+&r"(dst),              [src]"+&r"(src),
3388          [c]"+&f"(c.f),                [d]"+&f"(d.f)
3389        : [sstride]"r"((mips_reg)sstride),
3390          [dstride]"r"((mips_reg)dstride),
3391          [ff_pw_4]"f"(ff_pw_4.f)
3392        : "memory"
3393    );
3394#else
3395    int c = 8 - my, d = my;
3396    int x, y;
3397
3398    for (y = 0; y < h; y++) {
3399        for (x = 0; x < 4; x++)
3400            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3401        dst += dstride;
3402        src += sstride;
3403    }
3404#endif
3405}
3406
3407void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3408        ptrdiff_t sstride, int h, int mx, int my)
3409{
3410#if 1
3411    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3412    uint8_t *tmp = tmp_array;
3413
3414    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3415    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3416#else
3417    int a = 8 - mx, b = mx;
3418    int c = 8 - my, d = my;
3419    int x, y;
3420    uint8_t tmp_array[36];
3421    uint8_t *tmp = tmp_array;
3422
3423    for (y = 0; y < h + 1; y++) {
3424        for (x = 0; x < 4; x++)
3425            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3426        tmp += 4;
3427        src += sstride;
3428    }
3429
3430    tmp = tmp_array;
3431
3432    for (y = 0; y < h; y++) {
3433        for (x = 0; x < 4; x++)
3434            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3435        dst += dstride;
3436        tmp += 4;
3437    }
3438#endif
3439}
3440