1/*
2 * Loongson SIMD optimized h264qpel
3 *
4 * Copyright (c) 2015 Loongson Technology Corporation Limited
5 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "h264dsp_mips.h"
25#include "hpeldsp_mips.h"
26#include "libavcodec/bit_depth_template.c"
27#include "libavutil/mips/mmiutils.h"
28
/*
 * Copy h rows of 4 bytes each from src to dst.
 *
 * After each row src advances by srcStride bytes and dst by dstStride bytes.
 * The row is copied before h is tested (decrement, then bnez at the bottom),
 * so callers must pass h >= 1.
 *
 * NOTE(review): MMI_ULWC1 / MMI_SWC1 are defined in mmiutils.h, not here;
 * judging by their names they are an unaligned 32-bit load into an FP
 * register and a 32-bit store from it -- confirm against that header.
 */
29static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
30        int dstStride, int srcStride, int h)
31{
32    double ftmp[1];
33    DECLARE_VAR_LOW32;
34
35    __asm__ volatile (
36        "1:                                                             \n\t"
        /* move one 4-byte row through ftmp0 */
37        MMI_ULWC1(%[ftmp0], %[src], 0x00)
38        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        /* one row done: count down and step both pointers to the next row */
39        "addi       %[h],       %[h],           -0x01                   \n\t"
40        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
41        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
42        "bnez       %[h],       1b                                      \n\t"
        /* dst, src and h are read-write: the asm modifies all three */
43        : [ftmp0]"=&f"(ftmp[0]),
44          [dst]"+&r"(dst),                  [src]"+&r"(src),
45          RESTRICT_ASM_LOW32
46          [h]"+&r"(h)
47        : [dstStride]"r"((mips_reg)dstStride),
48          [srcStride]"r"((mips_reg)srcStride)
        /* the asm reads and writes memory the compiler cannot see */
49        : "memory"
50    );
51}
52
/*
 * Copy h rows of 8 bytes each from src to dst.
 *
 * After each row src advances by srcStride bytes and dst by dstStride bytes.
 * The row is copied before h is tested (decrement, then bnez at the bottom),
 * so callers must pass h >= 1.
 *
 * NOTE(review): MMI_ULDC1 / MMI_SDC1 are defined in mmiutils.h, not here;
 * judging by their names they are an unaligned 64-bit load into an FP
 * register and a 64-bit store from it -- confirm against that header
 * (in particular whether the store requires dst to be 8-byte aligned).
 */
53static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
54        int dstStride, int srcStride, int h)
55{
56    double ftmp[1];
57    DECLARE_VAR_ALL64;
58
59    __asm__ volatile (
60        "1:                                                             \n\t"
        /* move one 8-byte row through ftmp0 */
61        MMI_ULDC1(%[ftmp0], %[src], 0x00)
62        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        /* one row done: count down and step both pointers to the next row */
63        "addi       %[h],       %[h],           -0x01                   \n\t"
64        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
65        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
66        "bnez       %[h],       1b                                      \n\t"
        /* dst, src and h are read-write: the asm modifies all three */
67        : [ftmp0]"=&f"(ftmp[0]),
68          RESTRICT_ASM_ALL64
69          [dst]"+&r"(dst),                  [src]"+&r"(src),
70          [h]"+&r"(h)
71        : [dstStride]"r"((mips_reg)dstStride),
72          [srcStride]"r"((mips_reg)srcStride)
        /* the asm reads and writes memory the compiler cannot see */
73        : "memory"
74    );
75}
76
/*
 * Copy h rows of 16 bytes each from src to dst.
 *
 * Each row is moved as two 8-byte halves: bytes 0..7 travel through the FP
 * register ftmp0, bytes 8..15 through the general-purpose register tmp0
 * using the MIPS64 ldl/ldr and sdl/sdr unaligned doubleword pairs.
 * After each row src advances by srcStride bytes and dst by dstStride bytes.
 * The row is copied before h is tested (decrement, then bnez at the bottom),
 * so callers must pass h >= 1.
 *
 * NOTE(review): MMI_ULDC1 / MMI_SDC1 are defined in mmiutils.h, not here;
 * judging by their names they are an unaligned 64-bit load and a 64-bit
 * store -- confirm against that header.
 */
77static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
78        int dstStride, int srcStride, int h)
79{
80    double ftmp[1];
81    uint64_t tmp[1];
82    DECLARE_VAR_ALL64;
83
84    __asm__ volatile (
85        "1:                                                             \n\t"
        /* load both halves of the source row before storing anything */
86        MMI_ULDC1(%[ftmp0], %[src], 0x00)
87        "ldl        %[tmp0],    0x0f(%[src])                            \n\t"
88        "ldr        %[tmp0],    0x08(%[src])                            \n\t"
        /* store bytes 0..7, then bytes 8..15 */
89        MMI_SDC1(%[ftmp0], %[dst], 0x00)
90        "sdl        %[tmp0],    0x0f(%[dst])                            \n\t"
91        "sdr        %[tmp0],    0x08(%[dst])                            \n\t"
        /* one row done: count down and step both pointers to the next row */
92        "addi       %[h],       %[h],           -0x01                   \n\t"
93        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
94        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
95        "bnez       %[h],       1b                                      \n\t"
        /* dst, src and h are read-write: the asm modifies all three */
96        : [ftmp0]"=&f"(ftmp[0]),
97          [tmp0]"=&r"(tmp[0]),
98          RESTRICT_ASM_ALL64
99          [dst]"+&r"(dst),                  [src]"+&r"(src),
100          [h]"+&r"(h)
101        : [dstStride]"r"((mips_reg)dstStride),
102          [srcStride]"r"((mips_reg)srcStride)
        /* the asm reads and writes memory the compiler cannot see */
103        : "memory"
104    );
105}
106
/*
 * Store helpers taking a destination lvalue a and an intermediate sum b.
 * Both round b by adding 512 and shifting right by 10, then apply CLIP.
 *   op2_put: a = the clipped value.
 *   op2_avg: a = (old a + clipped value + 1) >> 1, i.e. the mean rounded up.
 * Both expand to an assignment expression without a trailing semicolon, and
 * op2_avg evaluates `a` twice, so arguments must be free of side effects.
 * NOTE(review): CLIP is not defined in this chunk; presumably it comes from
 * the included bit_depth_template.c -- confirm.
 */
107#define op2_avg(a, b)  a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
108#define op2_put(a, b)  a = CLIP(((b) + 512)>>10)
/*
 * Horizontal 6-tap lowpass filter for a 4x4 block, result stored to dst.
 *
 * For each of 4 rows (tmp0 counts 4 down to 0) and each of the 4 output
 * pixels x, the asm computes on 16-bit lanes
 *     (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16)
 * shifts it right arithmetically, packs it to bytes with unsigned saturation
 * and writes 4 bytes to dst.  Source offsets -2..+3 are used, so every row
 * touches src[-2] .. src[6].  src and dst advance by srcStride / dstStride
 * bytes per row.
 *
 * NOTE(review): ff_pw_20, ff_pw_5 and ff_pw_16 are defined outside this
 * chunk; presumably each holds four packed 16-bit copies of 20, 5 and 16.
 * ff_pw_5 is also passed as the psrah shift count, which presumably relies
 * on only the low bits (value 5) being used -- confirm against the MMI
 * manual.  MMI_ULWC1 / MMI_SWC1 are presumably an unaligned 32-bit load and
 * a 32-bit store (see mmiutils.h).
 */
109static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
110        int dstStride, int srcStride)
111{
112    double ftmp[10];
113    uint64_t tmp[1];
114    DECLARE_VAR_LOW32;
115
116    __asm__ volatile (
        /* ftmp0 = 0 (zero source for byte widening), tmp0 = row counter */
117        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
118        "dli        %[tmp0],    0x04                                    \n\t"
119        "1:                                                             \n\t"
        /* six 4-byte windows of the row, one per filter tap */
120        MMI_ULWC1(%[ftmp1], %[src], -0x02)
121        MMI_ULWC1(%[ftmp2], %[src], -0x01)
122        MMI_ULWC1(%[ftmp3], %[src],  0x00)
123        MMI_ULWC1(%[ftmp4], %[src],  0x01)
124        MMI_ULWC1(%[ftmp5], %[src],  0x02)
125        MMI_ULWC1(%[ftmp6], %[src],  0x03)
126
        /* widen the low 4 bytes of each window to 16-bit lanes */
127        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
128        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
129        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
130        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
131        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
132        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        /* pair the symmetric taps: centre, inner and outer */
133        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
134        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
135        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
        /* ftmp9 = 20*centre - 5*inner + outer + 16 */
136        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
137        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
138        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
139        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
140        "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
        /* scale down, then saturate to unsigned bytes and store 4 pixels */
141        "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
142        "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
143        MMI_SWC1(%[ftmp9], %[dst],  0x00)
        /* next row */
144        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
145        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
146        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
147        "bnez       %[tmp0],    1b                                      \n\t"
148        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
149          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
150          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
151          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
152          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
153          [tmp0]"=&r"(tmp[0]),
154          RESTRICT_ASM_LOW32
155          [dst]"+&r"(dst),                  [src]"+&r"(src)
156        : [dstStride]"r"((mips_reg)dstStride),
157          [srcStride]"r"((mips_reg)srcStride),
158          [ff_pw_20]"f"(ff_pw_20.f),        [ff_pw_5]"f"(ff_pw_5.f),
159          [ff_pw_16]"f"(ff_pw_16.f)
160        : "memory"
161    );
162}
163
/*
 * Horizontal 6-tap lowpass filter for an 8x8 block, result stored to dst.
 *
 * For each of 8 rows (tmp0 counts 8 down to 0) and each of the 8 output
 * pixels x, the asm computes on 16-bit lanes
 *     (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16)
 * shifts it right arithmetically, packs it to bytes with unsigned saturation
 * and writes 8 bytes to dst.  The 8 pixels are processed as a low and a high
 * group of four lanes (punpcklbh / punpckhbh).  Source offsets -2..+3 are
 * used, so every row touches src[-2] .. src[10].
 *
 * NOTE(review): ff_pw_20, ff_pw_5 and ff_pw_16 are defined outside this
 * chunk; presumably each holds four packed 16-bit copies of 20, 5 and 16.
 * ff_pw_5 is also passed as the psrah shift count, which presumably relies
 * on only the low bits (value 5) being used -- confirm against the MMI
 * manual.  MMI_ULDC1 / MMI_SDC1 are presumably an unaligned 64-bit load and
 * a 64-bit store (see mmiutils.h).
 */
164static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
165        int dstStride, int srcStride)
166{
167    double ftmp[11];
168    uint64_t tmp[1];
169    DECLARE_VAR_ALL64;
170
171    __asm__ volatile (
        /* ftmp0 = 0 (zero source for byte widening), tmp0 = row counter */
172        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
173        "dli        %[tmp0],    0x08                                    \n\t"
174        "1:                                                             \n\t"
        /* six 8-byte windows of the row, one per filter tap */
175        MMI_ULDC1(%[ftmp1], %[src], -0x02)
176        MMI_ULDC1(%[ftmp2], %[src], -0x01)
177        MMI_ULDC1(%[ftmp3], %[src],  0x00)
178        MMI_ULDC1(%[ftmp4], %[src],  0x01)
179        MMI_ULDC1(%[ftmp5], %[src],  0x02)
180        MMI_ULDC1(%[ftmp6], %[src],  0x03)
        /* centre taps (offsets 0, +1): ftmp3/ftmp4 = 20 * sum, low/high lanes */
181        "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
182        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
183        "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
184        "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
185        "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t"
186        "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
187        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t"
188        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
        /* inner taps (offsets -1, +2): ftmp2/ftmp5 = 5 * sum, low/high lanes */
189        "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
190        "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
191        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
192        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
193        "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t"
194        "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
195        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
196        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
        /* outer taps (offsets -2, +3): ftmp1/ftmp6 = sum, low/high lanes */
197        "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
198        "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
199        "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
200        "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
201        "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
202        "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
        /* combine: 20*centre - 5*inner + outer + 16, then scale down */
203        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
204        "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
205        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
206        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
207        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
208        "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
209        "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
210        "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
        /* merge both groups into 8 unsigned-saturated bytes and store */
211        "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
212        MMI_SDC1(%[ftmp9], %[dst],  0x00)
        /* next row */
213        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
214        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
215        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
216        "bnez       %[tmp0],    1b                                      \n\t"
217        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
218          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
219          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
220          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
221          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
222          [ftmp10]"=&f"(ftmp[10]),
223          [tmp0]"=&r"(tmp[0]),
224          RESTRICT_ASM_ALL64
225          [dst]"+&r"(dst),                  [src]"+&r"(src)
226        : [dstStride]"r"((mips_reg)dstStride),
227          [srcStride]"r"((mips_reg)srcStride),
228          [ff_pw_20]"f"(ff_pw_20.f),        [ff_pw_5]"f"(ff_pw_5.f),
229          [ff_pw_16]"f"(ff_pw_16.f)
230        : "memory"
231    );
232}
233
/*
 * Horizontal lowpass for a 16x16 block (put variant), built from four calls
 * to the 8x8 routine: top-left, top-right, bottom-left, bottom-right.
 * The right-hand tiles start 8 bytes into the row; the bottom tiles start
 * 8 rows further down in each buffer.
 */
static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    const uint8_t *src_bottom = src + 8 * srcStride;
    uint8_t *dst_bottom       = dst + 8 * dstStride;

    /* upper 16x8 half */
    put_h264_qpel8_h_lowpass_mmi(dst,     src,     dstStride, srcStride);
    put_h264_qpel8_h_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);

    /* lower 16x8 half */
    put_h264_qpel8_h_lowpass_mmi(dst_bottom,     src_bottom,
                                 dstStride, srcStride);
    put_h264_qpel8_h_lowpass_mmi(dst_bottom + 8, src_bottom + 8,
                                 dstStride, srcStride);
}
244
/*
 * Horizontal 6-tap lowpass filter for a 4x4 block, averaged into dst.
 *
 * For each of 4 rows (tmp0 counts 4 down to 0) and each of the 4 output
 * pixels x, the asm computes on 16-bit lanes
 *     (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16)
 * shifts it right arithmetically and packs it to bytes with unsigned
 * saturation.  The 4 bytes already present in dst are then loaded, combined
 * with the filtered bytes by pavgb (byte-wise average) and written back.
 * Source offsets -2..+3 are used, so every row touches src[-2] .. src[6].
 *
 * NOTE(review): ff_pw_20, ff_pw_5 and ff_pw_16 are defined outside this
 * chunk; presumably each holds four packed 16-bit copies of 20, 5 and 16.
 * ff_pw_5 is also passed as the psrah shift count, which presumably relies
 * on only the low bits (value 5) being used -- confirm against the MMI
 * manual.  MMI_ULWC1 / MMI_LWC1 / MMI_SWC1 are presumably an unaligned
 * 32-bit load, an aligned 32-bit load and a 32-bit store (see mmiutils.h);
 * if so, dst is expected to be 4-byte aligned -- confirm.
 */
245static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
246        int dstStride, int srcStride)
247{
248    double ftmp[11];
249    uint64_t tmp[1];
250    DECLARE_VAR_LOW32;
251
252    __asm__ volatile (
        /* ftmp0 = 0 (zero source for byte widening), tmp0 = row counter */
253        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
254        "dli        %[tmp0],    0x04                                    \n\t"
255        "1:                                                             \n\t"
        /* six 4-byte windows of the row, one per filter tap */
256        MMI_ULWC1(%[ftmp1], %[src], -0x02)
257        MMI_ULWC1(%[ftmp2], %[src], -0x01)
258        MMI_ULWC1(%[ftmp3], %[src],  0x00)
259        MMI_ULWC1(%[ftmp4], %[src],  0x01)
260        MMI_ULWC1(%[ftmp5], %[src],  0x02)
261        MMI_ULWC1(%[ftmp6], %[src],  0x03)
        /* widen the low 4 bytes of each window to 16-bit lanes */
262        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
263        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
264        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
265        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
266        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
267        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        /* pair the symmetric taps: centre, inner and outer */
268        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
269        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
270        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
        /* ftmp9 = 20*centre - 5*inner + outer + 16 */
271        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
272        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
273        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
274        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
275        "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
        /* scale down and saturate to unsigned bytes */
276        "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
277        "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
        /* average with the pixels already in dst, then store */
278        MMI_LWC1(%[ftmp10], %[dst],  0x00)
279        "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
280        MMI_SWC1(%[ftmp9], %[dst],  0x00)
        /* next row */
281        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
282        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
283        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
284        "bnez       %[tmp0],    1b                                      \n\t"
285        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
286          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
287          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
288          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
289          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
290          [ftmp10]"=&f"(ftmp[10]),
291          [tmp0]"=&r"(tmp[0]),
292          RESTRICT_ASM_LOW32
293          [dst]"+&r"(dst),                  [src]"+&r"(src)
294        : [dstStride]"r"((mips_reg)dstStride),
295          [srcStride]"r"((mips_reg)srcStride),
296          [ff_pw_20]"f"(ff_pw_20.f),        [ff_pw_5]"f"(ff_pw_5.f),
297          [ff_pw_16]"f"(ff_pw_16.f)
298        : "memory"
299    );
300}
301
/*
 * Horizontal 6-tap lowpass filter for an 8x8 block, averaged into dst.
 *
 * For each of 8 rows (tmp0 counts 8 down to 0) and each of the 8 output
 * pixels x, the asm computes on 16-bit lanes
 *     (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16)
 * shifts it right arithmetically and packs it to bytes with unsigned
 * saturation.  The 8 bytes already present in dst are then loaded, combined
 * with the filtered bytes by pavgb (byte-wise average) and written back.
 * The 8 pixels are processed as a low and a high group of four lanes.
 * Source offsets -2..+3 are used, so every row touches src[-2] .. src[10].
 *
 * NOTE(review): ff_pw_20, ff_pw_5 and ff_pw_16 are defined outside this
 * chunk; presumably each holds four packed 16-bit copies of 20, 5 and 16.
 * ff_pw_5 is also passed as the psrah shift count, which presumably relies
 * on only the low bits (value 5) being used -- confirm against the MMI
 * manual.  MMI_ULDC1 / MMI_LDC1 / MMI_SDC1 are presumably an unaligned
 * 64-bit load, an aligned 64-bit load and a 64-bit store (see mmiutils.h);
 * if so, dst is expected to be 8-byte aligned -- confirm.
 */
302static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
303        int dstStride, int srcStride)
304{
305    double ftmp[11];
306    uint64_t tmp[1];
307    DECLARE_VAR_ALL64;
308
309    __asm__ volatile (
        /* ftmp0 = 0 (zero source for byte widening), tmp0 = row counter */
310        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
311        "dli        %[tmp0],    0x08                                    \n\t"
312        "1:                                                             \n\t"
        /* six 8-byte windows of the row, one per filter tap */
313        MMI_ULDC1(%[ftmp1], %[src], -0x02)
314        MMI_ULDC1(%[ftmp2], %[src], -0x01)
315        MMI_ULDC1(%[ftmp3], %[src],  0x00)
316        MMI_ULDC1(%[ftmp4], %[src],  0x01)
317        MMI_ULDC1(%[ftmp5], %[src],  0x02)
318        MMI_ULDC1(%[ftmp6], %[src],  0x03)
        /* centre taps (offsets 0, +1): ftmp3/ftmp4 = 20 * sum, low/high lanes */
319        "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
320        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
321        "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
322        "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
323        "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t"
324        "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
325        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t"
326        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
        /* inner taps (offsets -1, +2): ftmp2/ftmp5 = 5 * sum, low/high lanes */
327        "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
328        "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
329        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
330        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
331        "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t"
332        "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
333        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
334        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
        /* outer taps (offsets -2, +3): ftmp1/ftmp6 = sum, low/high lanes */
335        "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
336        "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
337        "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
338        "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
339        "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
340        "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
        /* combine: 20*centre - 5*inner + outer + 16, then scale down */
341        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
342        "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
343        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
344        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
345        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
346        "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
347        "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
348        "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
        /* merge both groups into 8 unsigned-saturated bytes */
349        "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
        /* average with the pixels already in dst, then store */
350        MMI_LDC1(%[ftmp10], %[dst], 0x00)
351        "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
352        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        /* next row */
353        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
354        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
355        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
356        "bnez       %[tmp0],    1b                                      \n\t"
357        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
358          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
359          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
360          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
361          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
362          [ftmp10]"=&f"(ftmp[10]),
363          [tmp0]"=&r"(tmp[0]),
364          RESTRICT_ASM_ALL64
365          [dst]"+&r"(dst),                  [src]"+&r"(src)
366        : [dstStride]"r"((mips_reg)dstStride),
367          [srcStride]"r"((mips_reg)srcStride),
368          [ff_pw_20]"f"(ff_pw_20.f),        [ff_pw_5]"f"(ff_pw_5.f),
369          [ff_pw_16]"f"(ff_pw_16.f)
370        : "memory"
371    );
372}
373
/*
 * Horizontal lowpass for a 16x16 block (avg variant), built from four calls
 * to the 8x8 routine: top-left, top-right, bottom-left, bottom-right.
 * The right-hand tiles start 8 bytes into the row; the bottom tiles start
 * 8 rows further down in each buffer.
 */
static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    const uint8_t *src_bottom = src + 8 * srcStride;
    uint8_t *dst_bottom       = dst + 8 * dstStride;

    /* upper 16x8 half */
    avg_h264_qpel8_h_lowpass_mmi(dst,     src,     dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);

    /* lower 16x8 half */
    avg_h264_qpel8_h_lowpass_mmi(dst_bottom,     src_bottom,
                                 dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_mmi(dst_bottom + 8, src_bottom + 8,
                                 dstStride, srcStride);
}
384
385static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
386        int dstStride, int srcStride)
387{
388    double ftmp[12];
389    uint64_t tmp[1];
390    DECLARE_VAR_LOW32;
391
392    src -= 2 * srcStride;
393
394    __asm__ volatile (
395        ".set       push                                                \n\t"
396        ".set       noreorder                                           \n\t"
397        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
398        "dli        %[tmp0],    0x02                                    \n\t"
399        MMI_LWC1(%[ftmp1], %[src], 0x00)
400        "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
401        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
402        "dli        %[tmp0],    0x05                                    \n\t"
403        MMI_LWC1(%[ftmp2], %[src], 0x00)
404        "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
405        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
406        MMI_LWC1(%[ftmp3], %[src], 0x00)
407        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
408        MMI_LWC1(%[ftmp4], %[src], 0x00)
409        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
410        MMI_LWC1(%[ftmp5], %[src], 0x00)
411        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
412        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
413        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
414        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
415        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
416        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
417        MMI_LWC1(%[ftmp6], %[src], 0x00)
418        "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
419        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
420        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
421        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
422        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
423        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
424        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
425        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
426        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
427        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
428        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
429        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
430        MMI_SWC1(%[ftmp7], %[dst], 0x00)
431        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
432        MMI_LWC1(%[ftmp1], %[src], 0x00)
433        "paddh      %[ftmp7],   %[ftmp4],       %[ftmp5]                \n\t"
434        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
435        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
436        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
437        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
438        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
439        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
440        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
441        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
442        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
443        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
444        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
445        MMI_SWC1(%[ftmp7], %[dst], 0x00)
446        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
447        MMI_LWC1(%[ftmp2], %[src], 0x00)
448        "paddh      %[ftmp7],   %[ftmp5],       %[ftmp6]                \n\t"
449        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
450        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
451        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
452        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
453        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
454        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
455        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
456        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
457        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
458        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
459        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
460        MMI_SWC1(%[ftmp7], %[dst], 0x00)
461        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
462        MMI_LWC1(%[ftmp3], %[src], 0x00)
463        "paddh      %[ftmp7],   %[ftmp6],       %[ftmp1]                \n\t"
464        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
465        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
466        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
467        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
468        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
469        "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
470        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
471        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
472        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
473        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
474        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
475        MMI_SWC1(%[ftmp7], %[dst], 0x00)
476        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
477        ".set       pop                                                 \n\t"
478        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
479          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
480          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
481          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
482          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
483          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
484          [tmp0]"=&r"(tmp[0]),
485          RESTRICT_ASM_LOW32
486          [dst]"+&r"(dst),                  [src]"+&r"(src)
487        : [dstStride]"r"((mips_reg)dstStride),
488          [srcStride]"r"((mips_reg)srcStride),
489          [ff_pw_5]"f"(ff_pw_5.f),          [ff_pw_16]"f"(ff_pw_16.f)
490        : "memory"
491    );
492}
493
494static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
495        int dstStride, int srcStride)
496{
497    int w = 2;
498    int h = 8;
499    double ftmp[10];
500    uint64_t tmp[1];
501    DECLARE_VAR_LOW32;
502
503    src -= 2 * srcStride;
504
505    while (w--) {
506        __asm__ volatile (
507            ".set       push                                            \n\t"
508            ".set       noreorder                                       \n\t"
509            "dli        %[tmp0],    0x02                                \n\t"
510            MMI_LWC1(%[ftmp0], %[src], 0x00)
511            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
512            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
513            "dli        %[tmp0],    0x05                                \n\t"
514            MMI_LWC1(%[ftmp1], %[src], 0x00)
515            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
516            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
517            MMI_LWC1(%[ftmp2], %[src], 0x00)
518            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
519            "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
520            MMI_LWC1(%[ftmp3], %[src], 0x00)
521            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
522            MMI_LWC1(%[ftmp4], %[src], 0x00)
523            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
524            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
525            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
526            MMI_LWC1(%[ftmp5], %[src], 0x00)
527            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
528            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
529            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
530            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
531            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
532            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
533            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
534            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
535            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
536            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
537            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
538            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
539            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
540            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
541            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
542            MMI_SWC1(%[ftmp6], %[dst], 0x00)
543            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
544            MMI_LWC1(%[ftmp0], %[src], 0x00)
545            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
546            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
547            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
548            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
549            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
550            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
551            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
552            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
553            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
554            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
555            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
556            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
557            MMI_SWC1(%[ftmp6], %[dst], 0x00)
558            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
559            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
560            MMI_LWC1(%[ftmp1], %[src], 0x00)
561            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
562            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
563            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
564            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
565            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
566            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
567            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
568            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
569            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
570            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
571            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
572            MMI_SWC1(%[ftmp6], %[dst], 0x00)
573            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
574            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
575            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
576            MMI_LWC1(%[ftmp2], %[src], 0x00)
577            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
578            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
579            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
580            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
581            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
582            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
583            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
584            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
585            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
586            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
587            MMI_SWC1(%[ftmp6], %[dst], 0x00)
588            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
589            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
590            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
591            MMI_LWC1(%[ftmp3], %[src], 0x00)
592            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
593            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
594            "punpcklbh  %[ftmp3] ,  %[ftmp3],       %[ftmp7]            \n\t"
595            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
596            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
597            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
598            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
599            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
600            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
601            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
602            MMI_SWC1(%[ftmp6], %[dst], 0x00)
603            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
604            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
605            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
606            MMI_LWC1(%[ftmp4], %[src], 0x00)
607            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
608            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
609            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
610            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
611            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
612            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
613            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
614            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
615            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
616            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
617            MMI_SWC1(%[ftmp6], %[dst], 0x00)
618            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
619            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
620            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
621            MMI_LWC1(%[ftmp5], %[src], 0x00)
622            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
623            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
624            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
625            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
626            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
627            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
628            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
629            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
630            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
631            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
632            MMI_SWC1(%[ftmp6], %[dst], 0x00)
633            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
634            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
635            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
636            MMI_LWC1(%[ftmp0], %[src], 0x00)
637            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
638            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
639            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
640            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
641            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
642            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
643            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
644            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
645            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
646            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
647            MMI_SWC1(%[ftmp6], %[dst], 0x00)
648            "bne        %[h],       0x10,           2f                  \n\t"
649            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
650            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
651            MMI_LWC1(%[ftmp1], %[src], 0x00)
652            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
653            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
654            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
655            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
656            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
657            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
658            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
659            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
660            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
661            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
662            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
663            MMI_SWC1(%[ftmp6], %[dst], 0x00)
664            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
665            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
666            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
667            MMI_LWC1(%[ftmp2], %[src], 0x00)
668            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
669            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
670            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
671            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
672            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
673            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
674            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
675            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
676            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
677            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
678            MMI_SWC1(%[ftmp6], %[dst], 0x00)
679            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
680            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
681            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
682            MMI_LWC1(%[ftmp3], %[src], 0x00)
683            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
684            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
685            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
686            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
687            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
688            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
689            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
690            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
691            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
692            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
693            MMI_SWC1(%[ftmp6], %[dst], 0x00)
694            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
695            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
696            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
697            MMI_LWC1(%[ftmp4], %[src], 0x00)
698            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
699            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
700            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
701            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
702            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
703            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
704            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
705            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
706            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
707            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
708            MMI_SWC1(%[ftmp6], %[dst], 0x00)
709            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
710            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
711            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
712            MMI_LWC1(%[ftmp5], %[src], 0x00)
713            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
714            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
715            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
716            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
717            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
718            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
719            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
720            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
721            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
722            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
723            MMI_SWC1(%[ftmp6], %[dst], 0x00)
724            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
725            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
726            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
727            MMI_LWC1(%[ftmp0], %[src], 0x00)
728            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
729            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
730            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
731            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
732            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
733            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
734            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
735            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
736            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
737            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
738            MMI_SWC1(%[ftmp6], %[dst], 0x00)
739            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
740            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
741            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
742            MMI_LWC1(%[ftmp1], %[src], 0x00)
743            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
744            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
745            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
746            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
747            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
748            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
749            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
750            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
751            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
752            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
753            MMI_SWC1(%[ftmp6], %[dst], 0x00)
754            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
755            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
756            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
757            MMI_LWC1(%[ftmp2], %[src], 0x00)
758            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
759            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
760            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
761            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
762            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
763            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
764            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
765            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
766            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
767            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
768            MMI_SWC1(%[ftmp6], %[dst], 0x00)
769            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
770            "2:                                                         \n\t"
771            ".set       pop                                             \n\t"
772            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
773              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
774              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
775              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
776              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
777              [tmp0]"=&r"(tmp[0]),
778              RESTRICT_ASM_LOW32
779              [src]"+&r"(src),              [dst]"+&r"(dst),
780              [h]"+&r"(h)
781            : [dstStride]"r"((mips_reg)dstStride),
782              [srcStride]"r"((mips_reg)srcStride),
783              [ff_pw_5]"f"(ff_pw_5.f),      [ff_pw_16]"f"(ff_pw_16.f)
784            : "memory"
785        );
786
787        src += 4 - (h + 5) * srcStride;
788        dst += 4 - h * dstStride;
789    }
790}
791
/*
 * Invokes put_h264_qpel8_v_lowpass_mmi() four times: for each of two bands
 * (row offsets 0 and 8), once at the band start and once 8 bytes to the right.
 * Presumably this tiles a 16x16 area out of 8x8 quadrants -- the helper is
 * defined elsewhere in the file, confirm its block size there.
 *
 * dst/src are the top-left corners; dstStride/srcStride are the byte steps
 * between consecutive rows of dst and src respectively.
 */
static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    int band;

    /* Same call order as a hand-unrolled version: left then right half of
     * the upper band, followed by left then right half of the lower band. */
    for (band = 0; band < 16; band += 8) {
        uint8_t *out      = dst + band * dstStride;
        const uint8_t *in = src + band * srcStride;

        put_h264_qpel8_v_lowpass_mmi(out,     in,     dstStride, srcStride);
        put_h264_qpel8_v_lowpass_mmi(out + 8, in + 8, dstStride, srcStride);
    }
}
802
/*
 * Vertical lowpass over four output rows, averaged into dst.
 *
 * src is first rewound by 2 * srcStride; nine consecutive source rows are
 * then loaded. With a..f denoting six consecutive rows, every output row is
 * computed per 16-bit lane as
 *
 *     ((((c + d) << 2) - b - e) * ff_pw_5 + a + f + ff_pw_16) >> 5
 *
 * packed to bytes with packushb, combined with the bytes already present at
 * dst via pavgb, and stored back to dst.
 *
 * Row accesses go through MMI_LWC1 / MMI_SWC1 (from mmiutils.h, not visible
 * here; presumably 32-bit word load/store, i.e. four 8-bit pixels per row --
 * confirm). ff_pw_5 / ff_pw_16 are defined elsewhere; presumably packed
 * halfwords holding 5 and 16, which would make this the 1,-5,20,20,-5,1
 * filter with rounding -- confirm against their definitions.
 *
 * dst        destination rows; read for the average, then overwritten
 * src        source rows; rows from two above src onwards are read
 * dstStride  byte step between consecutive dst rows
 * srcStride  byte step between consecutive src rows
 *
 * NOTE(review): unlike neighbouring functions in this file that use the
 * MMI_*WC1 macros, this one has no DECLARE_VAR_LOW32 / RESTRICT_ASM_LOW32
 * pair. Verify that MMI_LWC1 / MMI_SWC1 never expand to something that
 * references %[low32] on any supported target.
 */
static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    double ftmp[10];    /* backing storage for the ten "f" asm operands      */
    uint64_t tmp[1];    /* integer scratch used to build the shift counts    */

    /* The first row loaded is two rows above the caller's src. */
    src -= 2 * srcStride;

    __asm__ volatile (
        ".set       push                                                \n\t"
        ".set       noreorder                                           \n\t"
        /* Constants: ftmp7 = 0 (second operand of every punpcklbh),
         * ftmp9 = 2 (psllh count), ftmp8 = 5 (psrah count). The setup is
         * interleaved with the load of the first source row. */
        "dli        %[tmp0],    0x02                                    \n\t"
        "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
        "dli        %[tmp0],    0x05                                    \n\t"
        MMI_LWC1(%[ftmp0], %[src], 0x00)
        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
        /* Load the next four rows into ftmp1..ftmp4, stepping src by
         * srcStride after each load. */
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        MMI_LWC1(%[ftmp1], %[src], 0x00)
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        MMI_LWC1(%[ftmp2], %[src], 0x00)
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        MMI_LWC1(%[ftmp3], %[src], 0x00)
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        MMI_LWC1(%[ftmp4], %[src], 0x00)
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        /* Interleave each row's low bytes with the zero register so the
         * arithmetic below operates on halfwords. */
        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"

        /* Output row 0: a..f = ftmp0, ftmp1, ftmp2, ftmp3, ftmp4, ftmp5.
         * ftmp6 accumulates the result. */
        MMI_LWC1(%[ftmp5], %[src], 0x00)
        "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
        "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]             \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
        /* Row "a" is no longer needed: its register first holds the current
         * dst bytes for pavgb, then receives the next source row. */
        MMI_LWC1(%[ftmp0], %[dst], 0x00)
        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        MMI_SWC1(%[ftmp6], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"

        /* Output row 1: a..f = ftmp1, ftmp2, ftmp3, ftmp4, ftmp5, ftmp0. */
        MMI_LWC1(%[ftmp0], %[src], 0x00)
        "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]                \n\t"
        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
        MMI_LWC1(%[ftmp1], %[dst], 0x00)
        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
        MMI_SWC1(%[ftmp6], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"

        /* Output row 2: a..f = ftmp2, ftmp3, ftmp4, ftmp5, ftmp0, ftmp1. */
        MMI_LWC1(%[ftmp1], %[src], 0x00)
        "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]                \n\t"
        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
        MMI_LWC1(%[ftmp2], %[dst], 0x00)
        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
        MMI_SWC1(%[ftmp6], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"

        /* Output row 3: a..f = ftmp3, ftmp4, ftmp5, ftmp0, ftmp1, ftmp2. */
        MMI_LWC1(%[ftmp2], %[src], 0x00)
        "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
        MMI_LWC1(%[ftmp3], %[dst], 0x00)
        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
        MMI_SWC1(%[ftmp6], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
        ".set       pop                                                 \n\t"
        /* Outputs: every temporary is early-clobber; src and dst are
         * read-write because the asm advances them (the updated values are
         * not used after the asm statement). */
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        /* Inputs: strides widened to mips_reg for PTR_ADDU, plus the two
         * multiplier/rounding constants in FP registers. */
        : [dstStride]"r"((mips_reg)dstStride),
          [srcStride]"r"((mips_reg)srcStride),
          [ff_pw_5]"f"(ff_pw_5.f),      [ff_pw_16]"f"(ff_pw_16.f)
        /* dst is written through a pointer the compiler cannot see. */
        : "memory"
    );
}
916
917static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
918        int dstStride, int srcStride)
919{
920    int w = 2;
921    int h = 8;
922    double ftmp[10];
923    uint64_t tmp[1];
924    DECLARE_VAR_LOW32;
925
926    src -= 2 * srcStride;
927
928    while (w--) {
929        __asm__ volatile (
930            ".set       push                                            \n\t"
931            ".set       noreorder                                       \n\t"
932            "dli        %[tmp0],    0x02                                \n\t"
933            "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
934            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
935            "dli        %[tmp0],    0x05                                \n\t"
936            MMI_LWC1(%[ftmp0], %[src], 0x00)
937            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
938            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
939            MMI_LWC1(%[ftmp1], %[src], 0x00)
940            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
941            MMI_LWC1(%[ftmp2], %[src], 0x00)
942            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
943            MMI_LWC1(%[ftmp3], %[src], 0x00)
944            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
945            MMI_LWC1(%[ftmp4], %[src], 0x00)
946            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
947            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
948            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
949            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
950            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
951            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
952            MMI_LWC1(%[ftmp5], %[src], 0x00)
953            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
954            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
955            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
956            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
957            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
958            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
959            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
960            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
961            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
962            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
963            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
964            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
965            MMI_LWC1(%[ftmp0], %[dst], 0x00)
966            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
967            MMI_SWC1(%[ftmp6], %[dst], 0x00)
968            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
969            MMI_LWC1(%[ftmp0], %[src], 0x00)
970            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
971            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
972            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
973            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
974            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
975            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
976            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
977            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
978            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
979            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
980            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
981            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
982            MMI_LWC1(%[ftmp1], %[dst], 0x00)
983            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
984            MMI_SWC1(%[ftmp6], %[dst], 0x00)
985            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
986            MMI_LWC1(%[ftmp1], %[src], 0x00)
987            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
988            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
989            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
990            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
991            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
992            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
993            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
994            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
995            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
996            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
997            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
998            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
999            MMI_LWC1(%[ftmp2], %[dst], 0x00)
1000            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1001            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1002            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1003            MMI_LWC1(%[ftmp2], %[src], 0x00)
1004            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1005            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1006            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1007            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1008            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1009            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1010            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1011            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1012            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1013            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1014            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1015            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1016            MMI_LWC1(%[ftmp3], %[dst], 0x00)
1017            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1018            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1019            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1020            MMI_LWC1(%[ftmp3], %[src], 0x00)
1021            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1022            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1023            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1024            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1025            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1026            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1027            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1028            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1029            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1030            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1031            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1032            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1033            MMI_LWC1(%[ftmp4], %[dst], 0x00)
1034            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1035            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1036            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1037            MMI_LWC1(%[ftmp4], %[src], 0x00)
1038            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1039            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1040            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1041            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1042            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1043            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1044            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1045            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1046            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1047            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1048            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1049            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1050            MMI_LWC1(%[ftmp5], %[dst], 0x00)
1051            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1052            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1053            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1054            MMI_LWC1(%[ftmp5], %[src], 0x00)
1055            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1056            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1057            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1058            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1059            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1060            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1061            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1062            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1063            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1064            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1065            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1066            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1067            MMI_LWC1(%[ftmp0], %[dst], 0x00)
1068            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1069            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1070            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1071            MMI_LWC1(%[ftmp0], %[src], 0x00)
1072            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1073            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1074            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1075            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1076            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1077            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1078            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1079            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1080            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1081            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1082            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1083            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1084            MMI_LWC1(%[ftmp1], %[dst], 0x00)
1085            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1086            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1087            "bne        %[h],       0x10,           2f                  \n\t"
1088            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1089            MMI_LWC1(%[ftmp1], %[src], 0x00)
1090            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1091            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1092            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1093            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1094            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1095            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1096            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1097            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1098            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1099            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1100            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1101            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1102            MMI_LWC1(%[ftmp2], %[dst], 0x00)
1103            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1104            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1105            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1106            MMI_LWC1(%[ftmp2], %[src], 0x00)
1107            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1108            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1109            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1110            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1111            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1112            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1113            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1114            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1115            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1116            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1117            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1118            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1119            MMI_LWC1(%[ftmp3], %[dst], 0x00)
1120            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1121            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1122            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1123            MMI_LWC1(%[ftmp3], %[src], 0x00)
1124            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1125            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1126            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1127            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1128            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1129            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1130            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1131            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1132            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1133            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1134            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1135            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1136            MMI_LWC1(%[ftmp4], %[dst], 0x00)
1137            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1138            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1139            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1140            MMI_LWC1(%[ftmp4], %[src], 0x00)
1141            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1142            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1143            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1144            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1145            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1146            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1147            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1148            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1149            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1150            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1151            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1152            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1153            MMI_LWC1(%[ftmp5], %[dst], 0x00)
1154            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1155            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1156            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1157            MMI_LWC1(%[ftmp5], %[src], 0x00)
1158            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1159            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1160            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1161            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1162            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1163            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1164            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1165            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1166            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1167            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1168            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1169            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1170            MMI_LWC1(%[ftmp0], %[dst], 0x00)
1171            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1172            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1173            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1174            MMI_LWC1(%[ftmp0], %[src], 0x00)
1175            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1176            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1177            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1178            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1179            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1180            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1181            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1182            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1183            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1184            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1185            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1186            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1187            MMI_LWC1(%[ftmp1], %[dst], 0x00)
1188            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1189            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1190            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1191            MMI_LWC1(%[ftmp1], %[src], 0x00)
1192            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1193            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1194            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1195            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1196            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1197            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1198            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1199            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1200            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1201            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1202            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1203            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1204            MMI_LWC1(%[ftmp2], %[dst], 0x00)
1205            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1206            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1207            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1208            MMI_LWC1(%[ftmp2], %[src], 0x00)
1209            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1210            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1211            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1212            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1213            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1214            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1215            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1216            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1217            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1218            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1219            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1220            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1221            MMI_LWC1(%[ftmp3], %[dst], 0x00)
1222            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1223            MMI_SWC1(%[ftmp6], %[dst], 0x00)
1224            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1225            "2:                                                         \n\t"
1226            ".set       pop                                             \n\t"
1227            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1228              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1229              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1230              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1231              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1232              [tmp0]"=&r"(tmp[0]),
1233              RESTRICT_ASM_LOW32
1234              [src]"+&r"(src),              [dst]"+&r"(dst),
1235              [h]"+&r"(h)
1236            : [dstStride]"r"((mips_reg)dstStride),
1237              [srcStride]"r"((mips_reg)srcStride),
1238              [ff_pw_5]"f"(ff_pw_5.f),      [ff_pw_16]"f"(ff_pw_16.f)
1239            : "memory"
1240        );
1241
1242        src += 4 - (h + 5) * srcStride;
1243        dst += 4 - h * dstStride;
1244    }
1245}
1246
/*
 * 16-wide "avg" vertical lowpass, assembled from the 8-wide routine.
 *
 * avg_h264_qpel8_v_lowpass_mmi() is invoked four times: for row offsets 0
 * and 8, first at column offset 0 and then at column offset 8. The strides
 * are forwarded unchanged.
 *
 * dst/dstStride: destination pointer and row pitch.
 * src/srcStride: source pointer and row pitch.
 */
static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    int row;

    /* Upper half (row 0) first, then lower half (row 8). */
    for (row = 0; row < 16; row += 8) {
        uint8_t *d       = dst + row * dstStride;
        const uint8_t *s = src + row * srcStride;

        /* Left tile, then the tile 8 columns to its right. */
        avg_h264_qpel8_v_lowpass_mmi(d,     s,     dstStride, srcStride);
        avg_h264_qpel8_v_lowpass_mmi(d + 8, s + 8, dstStride, srcStride);
    }
}
1257
/*
 * 4x4 two-pass ("hv") 6-tap lowpass, put variant.
 *
 * Pass 1 (inline asm): starting two rows above src, filters 9 source rows
 * horizontally with taps (1, -5, 20, 20, -5, 1) applied to src[-2..3] and
 * stores the unrounded 16-bit sums, 4 per row, into a 36-entry scratch
 * buffer.
 * Pass 2 (C loop): applies the same taps down each of the 4 columns of the
 * scratch buffer and writes a 4x4 block to dst through op2_put().
 *
 * dst/dstStride: output block and the distance between its rows.
 * src/srcStride: input pixels and the distance between their rows; rows
 *                -2..6 relative to src are visited.
 *
 * NOTE(review): INIT_CLIP and op2_put() are defined outside this view;
 * op2_put presumably rounds, scales and clips the sum - confirm there.
 */
1258static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
1259        int dstStride, int srcStride)
1260{
1261    INIT_CLIP
1262    int i;
    /* Scratch for pass 1: 9 rows x 4 int16_t intermediates. */
1263    int16_t _tmp[36];
1264    int16_t *tmp = _tmp;
1265    double ftmp[10];
    /* Row counter for the asm loop (shadowed by a local inside the C loop). */
1266    uint64_t tmp0;
1267    DECLARE_VAR_LOW32;
1268
    /* The vertical taps need two rows above the block. */
1269    src -= 2*srcStride;
1270
1271    __asm__ volatile (
        /* ftmp0 = 0, used as the zero operand of the unpacks; tmp0 = 9 rows. */
1272        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1273        "dli        %[tmp0],    0x09                                    \n\t"
1274        "1:                                                             \n\t"
        /*
         * Six overlapping loads at src-2 .. src+3, one per filter tap.
         * NOTE(review): MMI_ULWC1 comes from mmiutils.h; presumably an
         * unaligned 32-bit load - confirm there.
         */
1275        MMI_ULWC1(%[ftmp1], %[src], -0x02)
1276        MMI_ULWC1(%[ftmp2], %[src], -0x01)
1277        MMI_ULWC1(%[ftmp3], %[src],  0x00)
1278        MMI_ULWC1(%[ftmp4], %[src],  0x01)
1279        MMI_ULWC1(%[ftmp5], %[src],  0x02)
1280        MMI_ULWC1(%[ftmp6], %[src],  0x03)
        /* Unpack each load against the zeroed ftmp0 (bytes -> 16-bit lanes). */
1281        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1282        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1283        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1284        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1285        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1286        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        /* ftmp9 = 20*(ftmp3+ftmp4) - 5*(ftmp2+ftmp5) + (ftmp1+ftmp6). */
1287        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1288        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
1289        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
1290        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
1291        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
1292        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1293        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
        /* Store one intermediate row, then advance to the next src/tmp row. */
1294        MMI_SDC1(%[ftmp9], %[tmp], 0x00)
1295        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
1296        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
1297        PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
1298        "bnez       %[tmp0],    1b                                      \n\t"
1299        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1300          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1301          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1302          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1303          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1304          [tmp0]"=&r"(tmp0),
1305          RESTRICT_ASM_LOW32
1306          [tmp]"+&r"(tmp),                  [src]"+&r"(src)
        /* tmpStride is 8 bytes, i.e. 4 int16_t entries per intermediate row. */
1307        : [tmpStride]"r"(8),
1308          [srcStride]"r"((mips_reg)srcStride),
1309          [ff_pw_20]"f"(ff_pw_20.f),        [ff_pw_5]"f"(ff_pw_5.f)
1310        : "memory"
1311    );
1312
    /*
     * The asm advanced tmp by 9 rows (36 entries, one past the buffer).
     * Step back to _tmp + 8, the third intermediate row, so that
     * tmp[-8] .. tmp[24] below cover intermediate rows 0..8.
     */
1313    tmp -= 28;
1314
    /* Pass 2: one iteration per output column. */
1315    for (i=0; i<4; i++) {
        /*
         * Column i of the nine intermediate rows (4 entries per row).
         * The local tmp0 shadows the outer uint64_t tmp0 in this scope.
         */
1316        const int16_t tmpB= tmp[-8];
1317        const int16_t tmpA= tmp[-4];
1318        const int16_t tmp0= tmp[ 0];
1319        const int16_t tmp1= tmp[ 4];
1320        const int16_t tmp2= tmp[ 8];
1321        const int16_t tmp3= tmp[12];
1322        const int16_t tmp4= tmp[16];
1323        const int16_t tmp5= tmp[20];
1324        const int16_t tmp6= tmp[24];
        /* Output row k uses intermediate rows k..k+5 with taps 1,-5,20,20,-5,1. */
1325        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
1326        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
1327        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
1328        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
        /* Move to the next column in both the output and the scratch buffer. */
1329        dst++;
1330        tmp++;
1331    }
1332}
1333
1334static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
1335        const uint8_t *src, ptrdiff_t tmpStride, ptrdiff_t srcStride, int size)
1336{
1337    int w = (size + 8) >> 2;
1338    double ftmp[11];
1339    uint64_t tmp0;
1340    DECLARE_VAR_LOW32;
1341
1342    src -= 2 * srcStride + 2;
1343
1344    while (w--) {
1345        __asm__ volatile (
1346            "dli        %[tmp0],    0x02                                \n\t"
1347            MMI_ULWC1(%[ftmp0], %[src], 0x00)
1348            "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
1349            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1350            "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
1351            MMI_ULWC1(%[ftmp1], %[src], 0x00)
1352            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1353            MMI_ULWC1(%[ftmp2], %[src], 0x00)
1354            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1355            MMI_ULWC1(%[ftmp3], %[src], 0x00)
1356            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1357            MMI_ULWC1(%[ftmp4], %[src], 0x00)
1358            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1359            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1360            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1361            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1362            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1363            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1364            MMI_ULWC1(%[ftmp5], %[src], 0x00)
1365            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1366            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1367            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1368            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1369            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1370            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1371            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1372            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1373            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1374            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1375            MMI_SDC1(%[ftmp6], %[tmp], 0x00)
1376            MMI_ULWC1(%[ftmp0], %[src], 0x00)
1377            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1378            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1379            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1380            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1381            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1382            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1383            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1384            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1385            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1386            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1387            MMI_SDC1(%[ftmp6], %[tmp], 0x30)
1388            MMI_ULWC1(%[ftmp1], %[src], 0x00)
1389            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1390            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1391            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1392            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1393            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1394            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1395            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1396            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1397            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1398            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1399            MMI_SDC1(%[ftmp6], %[tmp], 0x60)
1400            MMI_ULWC1(%[ftmp2], %[src], 0x00)
1401            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1402            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1403            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1404            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1405            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1406            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1407            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1408            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1409            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1410            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1411            MMI_SDC1(%[ftmp6], %[tmp], 0x90)
1412            MMI_ULWC1(%[ftmp3], %[src], 0x00)
1413            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1414            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1415            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1416            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1417            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1418            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1419            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1420            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1421            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1422            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1423            MMI_SDC1(%[ftmp6], %[tmp], 0xc0)
1424            MMI_ULWC1(%[ftmp4], %[src], 0x00)
1425            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1426            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1427            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1428            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1429            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1430            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1431            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1432            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1433            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1434            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1435            MMI_SDC1(%[ftmp6], %[tmp], 0xf0)
1436            MMI_ULWC1(%[ftmp5], %[src], 0x00)
1437            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1438            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1439            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1440            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1441            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1442            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1443            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1444            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1445            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1446            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1447            MMI_SDC1(%[ftmp6], %[tmp], 0x120)
1448            MMI_ULWC1(%[ftmp0], %[src], 0x00)
1449            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1450            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1451            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1452            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1453            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1454            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1455            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1456            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1457            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1458            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1459            MMI_SDC1(%[ftmp6], %[tmp], 0x150)
1460            "bne        %[size],    0x10,           2f                  \n\t"
1461
1462            MMI_ULWC1(%[ftmp1], %[src], 0x00)
1463            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1464            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1465            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1466            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1467            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1468            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1469            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1470            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1471            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1472            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1473            MMI_SDC1(%[ftmp6], %[tmp], 0x180)
1474            MMI_ULWC1(%[ftmp2], %[src], 0x00)
1475            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1476            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1477            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1478            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1479            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1480            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1481            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1482            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1483            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1484            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1485            MMI_SDC1(%[ftmp6], %[tmp], 0x1b0)
1486            MMI_ULWC1(%[ftmp3], %[src], 0x00)
1487            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1488            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1489            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1490            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1491            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1492            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1493            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1494            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1495            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1496            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1497            MMI_SDC1(%[ftmp6], %[tmp], 0x1e0)
1498            MMI_ULWC1(%[ftmp4], %[src], 0x00)
1499            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1500            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1501            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1502            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1503            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1504            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1505            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1506            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1507            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1508            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1509            MMI_SDC1(%[ftmp6], %[tmp], 0x210)
1510            MMI_ULWC1(%[ftmp5], %[src], 0x00)
1511            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1512            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1513            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1514            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1515            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1516            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1517            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1518            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1519            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1520            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1521            MMI_SDC1(%[ftmp6], %[tmp], 0x240)
1522            MMI_ULWC1(%[ftmp0], %[src], 0x00)
1523            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1524            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1525            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1526            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1527            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1528            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1529            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1530            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1531            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1532            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1533            MMI_SDC1(%[ftmp6], %[tmp], 0x270)
1534            MMI_ULWC1(%[ftmp1], %[src], 0x00)
1535            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1536            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1537            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1538            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1539            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1540            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1541            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1542            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1543            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1544            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1545            MMI_SDC1(%[ftmp6], %[tmp], 0x2a0)
1546            MMI_ULWC1(%[ftmp2], %[src], 0x00)
1547            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1548            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1549            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1550            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1551            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1552            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1553            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1554            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1555            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1556            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1557            MMI_SDC1(%[ftmp6], %[tmp], 0x2d0)
1558            "2:                                                         \n\t"
1559            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1560              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1561              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1562              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1563              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1564              [ftmp10]"=&f"(ftmp[10]),
1565              [tmp0]"=&r"(tmp0),
1566              RESTRICT_ASM_LOW32
1567              [src]"+&r"(src)
1568            : [tmp]"r"(tmp),                [size]"r"(size),
1569              [srcStride]"r"((mips_reg)srcStride),
1570              [ff_pw_5]"f"(ff_pw_5.f),      [ff_pw_16]"f"(ff_pw_16.f)
1571            : "memory"
1572        );
1573
1574        tmp += 4;
1575        src += 4 - (size + 5) * srcStride;
1576    }
1577}
1578
/*
 * Second pass of the 8x8 / 16x16 "hv" put filter: filters the 16-bit
 * intermediates stored in tmp along each row and writes 8-bit pixels to dst.
 *
 * dst        destination pixels; 8 bytes are stored per row with MMI_SDC1
 * tmp        16-bit intermediates; consecutive rows are 0x30 bytes
 *            (24 int16_t) apart
 * dstStride  byte distance between destination rows
 * tmpStride  not used in this function; the row pitch of tmp is hard-coded
 * size       number of rows per strip and block width; the callers visible
 *            in this file pass 8 or 16
 *
 * The block is processed in strips that are 8 pixels wide and `size` rows
 * high.  w is 0 for size 8 and 1 for size 16, so the do/while body runs
 * once or twice respectively.
 */
1579static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
1580        int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
1581{
1582    int w = size >> 4;
1583    double ftmp[10];
1584    uint64_t tmp0;
1585    DECLARE_VAR_ALL64;
1586
1587    do {
1588        int h = size;
1589
1590        __asm__ volatile (
                /* ftmp8 = 2 and ftmp9 = 6: shift counts for psrah below */
1591            "dli        %[tmp0],    0x02                                \n\t"
1592            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
1593            "dli        %[tmp0],    0x06                                \n\t"
1594            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
1595            "1:                                                         \n\t"
                /*
                 * With t[k] the k-th int16_t of the current row, build the
                 * three symmetric tap sums for outputs 0-3 (ftmp0..2) and
                 * outputs 4-7 (ftmp3..5):
                 *   a = t[i]   + t[i+5]   -> ftmp0 / ftmp3
                 *   b = t[i+1] + t[i+4]   -> ftmp1 / ftmp4
                 *   c = t[i+2] + t[i+3]   -> ftmp2 / ftmp5
                 */
1596            MMI_LDC1(%[ftmp0], %[tmp], 0x00)
1597            MMI_LDC1(%[ftmp3], %[tmp], 0x08)
1598            MMI_LDC1(%[ftmp6], %[tmp], 0x10)
1599            MMI_ULDC1(%[ftmp1], %[tmp], 0x02)
1600            MMI_ULDC1(%[ftmp4], %[tmp], 0x0a)
1601            MMI_ULDC1(%[ftmp5], %[tmp], 0x12)
1602            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
1603            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
1604            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1605            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"
1606            MMI_ULDC1(%[ftmp2], %[tmp], 0x04)
1607            MMI_ULDC1(%[ftmp6], %[tmp], 0x06)
1608            MMI_ULDC1(%[ftmp5], %[tmp], 0x0c)
1609            MMI_ULDC1(%[ftmp7], %[tmp], 0x0e)
1610            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1611            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
                /*
                 * Evaluate (((((a - b) >> 2) - b + c) >> 2) + c) >> 6 in
                 * 16-bit lanes, which approximates (a - 5*b + 20*c) >> 10
                 * using only adds and shifts.
                 */
1612            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1613            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
1614            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t"
1615            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
1616            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1617            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
1618            "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
1619            "paddsh     %[ftmp3] ,  %[ftmp3],       %[ftmp5]            \n\t"
1620            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t"
1621            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
1622            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
1623            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1624            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
1625            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
                /* Pack the two groups of four halfwords into 8 bytes and store the row */
1626            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
1627            "addi       %[h],       %[h],           -0x01               \n\t"
1628            MMI_SDC1(%[ftmp0], %[dst], 0x00)
                /* Next row: tmp advances by 0x30 bytes, dst by dstStride */
1629            PTR_ADDIU  "%[tmp],     %[tmp],         0x30                \n\t"
1630            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1631            "bnez       %[h],       1b                                  \n\t"
1632            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1633              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1634              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1635              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1636              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1637              [tmp0]"=&r"(tmp0),
1638              RESTRICT_ASM_ALL64
1639              [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
1640              [h]"+&r"(h)
1641            : [dstStride]"r"((mips_reg)dstStride)
1642            : "memory"
1643        );
1644
            /*
             * The asm left tmp and dst `size` rows further down.  Rewind
             * those rows and step 8 columns to the right for the next strip
             * (tmp counts int16_t elements, 24 per row).
             */
1645        tmp += 8 - size * 24;
1646        dst += 8 - size * dstStride;
1647    } while (w--);
1648}
1649
/*
 * Two-pass "hv" put filter shared by the 8x8 and 16x16 entry points.
 *
 * dst        destination pixels
 * tmp        scratch buffer for the 16-bit intermediates produced by the
 *            first pass and consumed by the second
 * src        source pixels
 * dstStride  byte stride of dst
 * tmpStride  forwarded unchanged to both passes
 * srcStride  byte stride of src
 * size       block size, forwarded unchanged to both passes
 */
static void put_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
                                              const uint8_t *src,
                                              ptrdiff_t dstStride,
                                              ptrdiff_t tmpStride,
                                              ptrdiff_t srcStride, int size)
{
    /* Pass 1: filter src into the 16-bit intermediate buffer. */
    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src,
                                       tmpStride, srcStride, size);

    /* Pass 2: filter the intermediates and store the pixels to dst. */
    put_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp,
                                       dstStride, tmpStride, size);
}
1657
/*
 * 8x8 "hv" put filter: fixes the block size of the shared 8/16
 * implementation to 8.  All other arguments are forwarded unchanged.
 */
static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
                                          const uint8_t *src,
                                          ptrdiff_t dstStride,
                                          ptrdiff_t tmpStride,
                                          ptrdiff_t srcStride)
{
    const int block_size = 8;

    put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
                                      srcStride, block_size);
}
1665
/*
 * 16x16 "hv" put filter: fixes the block size of the shared 8/16
 * implementation to 16.  All other arguments are forwarded unchanged.
 */
static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
                                           const uint8_t *src,
                                           ptrdiff_t dstStride,
                                           ptrdiff_t tmpStride,
                                           ptrdiff_t srcStride)
{
    const int block_size = 16;

    put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
                                      srcStride, block_size);
}
1673
/*
 * 8-row horizontal 6-tap lowpass filter whose result is averaged with a
 * second source and stored to dst (8 bytes per row).
 *
 * For output pixel x of a row, with s = src, the asm evaluates
 *   (20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3]) + 16) >> 5
 * (assuming ff_pw_5 / ff_pw_16 hold 5 and 16 in every 16-bit lane), packs
 * it to a byte with unsigned saturation and averages it (pavgb) with the
 * byte at the same position in src2.
 *
 * dst         destination; written with MMI_SDC1
 * src         source pixels; bytes src[-2] .. src[10] of each row are read
 * src2        second source; 8 bytes per row are read with MMI_LDC1
 * dstStride   byte step applied to dst AND to src after each row
 *             (NOTE(review): there is no separate source stride here -
 *             callers must use the same stride for src and dst; confirm)
 * src2Stride  byte step applied to src2 after each row
 */
1674static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
1675        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
1676{
1677    int h = 8;
1678    double ftmp[9];
1679    uint64_t tmp[1];
1680    DECLARE_VAR_LOW32;
1681    DECLARE_VAR_ALL64;
1682
1683    __asm__ volatile (
            /* ftmp7 = 2 (shift for *4), ftmp8 = 5 (final shift), ftmp0 = 0 (byte -> halfword unpack) */
1684        "dli        %[tmp0],    0x02                                    \n\t"
1685        "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
1686        "dli        %[tmp0],    0x05                                    \n\t"
1687        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1688        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
1689        "1:                                                             \n\t"
            /* Centre taps: (s[x] + s[x+1]) << 2, low four lanes in ftmp1, high four in ftmp2 */
1690        MMI_ULDC1(%[ftmp1], %[src], 0x00)
1691        MMI_ULDC1(%[ftmp3], %[src], 0x01)
1692        "punpckhbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
1693        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1694        "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
1695        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1696        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
1697        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1698        "psllh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1699        "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
            /* Middle taps: subtract (s[x-1] + s[x+2]), then multiply by ff_pw_5 */
1700        MMI_ULDC1(%[ftmp3], %[src], -0x01)
1701        MMI_ULDC1(%[ftmp5], %[src],  0x02)
1702        "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
1703        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1704        "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
1705        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1706        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
1707        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
1708        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
1709        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1710        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
1711        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
            /*
             * Outer taps.  ftmp4 still holds s[3..6] and ftmp5 holds
             * s[2..5] from the unpacks above, so only s[-2..1] and
             * s[7..10] need to be loaded:
             *   low lanes:  s[-2..1] + s[3..6]  -> ftmp3
             *   high lanes: s[2..5]  + s[7..10] -> ftmp5
             * ff_pw_16 is added to both before the final shift.
             */
1712        MMI_ULWC1(%[ftmp3], %[src], -0x02)
1713        MMI_ULWC1(%[ftmp6], %[src], 0x07)
1714        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1715        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
1716        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
1717        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1718        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
1719        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
1720        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1721        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
1722        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
1723        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
            /* Pack to 8 bytes, average with the src2 row, store, then step all three pointers */
1724        MMI_LDC1(%[ftmp5], %[src2],  0x00)
1725        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
1726        PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
1727        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
            /*
             * NOTE(review): the int row counter is decremented with the
             * pointer-width PTR_ADDU and an immediate operand; this depends
             * on the assembler accepting that form - confirm.
             */
1728        PTR_ADDU   "%[h],       %[h],           -0x01                   \n\t"
1729        MMI_SDC1(%[ftmp1], %[dst], 0x00)
1730        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
1731        PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
1732        "bgtz       %[h],       1b                                      \n\t"
1733        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1734          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1735          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1736          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1737          [ftmp8]"=&f"(ftmp[8]),
1738          [tmp0]"=&r"(tmp[0]),
1739          RESTRICT_ASM_LOW32
1740          RESTRICT_ASM_ALL64
1741          [src]"+&r"(src),                  [dst]"+&r"(dst),
1742          [src2]"+&r"(src2),                [h]"+&r"(h)
1743        : [src2Stride]"r"((mips_reg)src2Stride),
1744          [dstStride]"r"((mips_reg)dstStride),
1745          [ff_pw_5]"f"(ff_pw_5.f),          [ff_pw_16]"f"(ff_pw_16.f)
1746        : "memory"
1747    );
1748}
1749
/*
 * Converts 16-bit intermediates to pixels and averages them with an 8-bit
 * source, 8 pixels per row and two rows per loop iteration.
 *
 * Each int16_t is shifted right arithmetically by 5, packed to a byte with
 * unsigned saturation, averaged (pavgb) with the matching byte of src8 and
 * stored to dst.
 *
 * dst         destination; rows at dst and dst + dstStride are written
 * src16       16-bit input; the second row of a pair is read 0x30 bytes
 *             (24 int16_t) after the first
 * src8        8-bit input; rows at src8 and src8 + src8Stride are read
 * dstStride   byte stride of dst
 * src8Stride  byte stride of src8
 * h           row count; the loop subtracts 2 per pass and only stops when
 *             h becomes exactly 0, so h has to be a positive even number
 */
1750static void put_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
1751        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
1752{
1753    double ftmp[7];
1754    uint64_t tmp0;
1755    DECLARE_VAR_ALL64;
1756    DECLARE_VAR_ADDRT;
1757
1758    do {
1759        __asm__ volatile (
                /* ftmp6 = 5: shift count; ftmp0/1 = first row, ftmp2/3 = second row */
1760            "dli        %[tmp0],    0x05                                \n\t"
1761            MMI_ULDC1(%[ftmp0], %[src16], 0x00)
1762            "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
1763            MMI_ULDC1(%[ftmp1], %[src16], 0x08)
1764            MMI_ULDC1(%[ftmp2], %[src16], 0x30)
1765            MMI_ULDC1(%[ftmp3], %[src16], 0x38)
1766            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
1767            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
1768            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1769            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
                /* Narrow each row to 8 bytes */
1770            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1771            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
                /* Average with the two src8 rows and store both result rows */
1772            MMI_LDC1(%[ftmp5], %[src8], 0x00)
1773            MMI_LDXC1(%[ftmp4], %[src8], %[src8Stride], 0x00)
1774            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1775            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
1776            MMI_SDC1(%[ftmp0], %[dst], 0x00)
1777            MMI_SDXC1(%[ftmp2], %[dst], %[dstStride], 0x00)
1778            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1779              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1780              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1781              [ftmp6]"=&f"(ftmp[6]),
1782              RESTRICT_ASM_ALL64
1783              RESTRICT_ASM_ADDRT
1784              [tmp0]"=&r"(tmp0)
1785            : [src8]"r"(src8),              [src16]"r"(src16),
1786              [dst]"r"(dst),
1787              [src8Stride]"r"((mips_reg)src8Stride),
1788              [dstStride]"r"((mips_reg)dstStride)
1789            : "memory"
1790        );
1791
            /* The asm does not modify the pointers; advance all of them by two rows here */
1792        src8  += 2 * src8Stride;
1793        src16 += 48;
1794        dst   += 2 * dstStride;
1795    } while (h -= 2);
1796}
1797
/*
 * 16x16 variant of the horizontal lowpass + average-with-src2 filter,
 * built from four calls to the 8x8 routine (top-left, top-right,
 * bottom-left, bottom-right, in that order).
 *
 * dst         destination pixels
 * src         source pixels; stepped with dstStride, like dst
 * src2        second source that the filtered result is averaged with
 * dstStride   byte stride used for both dst and src
 * src2Stride  byte stride of src2
 */
static void put_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
{
    /* Start of the lower 8 rows in each of the three planes. */
    uint8_t       *const dst_lower  = dst  + 8 * dstStride;
    const uint8_t *const src_lower  = src  + 8 * dstStride;
    const uint8_t *const src2_lower = src2 + 8 * src2Stride;

    /* Upper half: left then right 8x8 block. */
    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride);
    put_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8,
                                    dstStride, src2Stride);

    /* Lower half: left then right 8x8 block. */
    put_h264_qpel8_h_lowpass_l2_mmi(dst_lower, src_lower, src2_lower,
                                    dstStride, src2Stride);
    put_h264_qpel8_h_lowpass_l2_mmi(dst_lower + 8, src_lower + 8,
                                    src2_lower + 8, dstStride, src2Stride);
}
1813
/*
 * 16-pixel-wide version of put_pixels8_l2_shift5_mmi(): runs the 8-wide
 * routine on the left half and then on the right half of every row.
 *
 * dst         destination pixels
 * src16       16-bit intermediates
 * src8        8-bit source that the converted intermediates are averaged with
 * dstStride   byte stride of dst
 * src8Stride  byte stride of src8
 * h           number of rows, forwarded unchanged
 */
static void put_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
{
    int col;

    /* Column offsets 0 and 8 apply equally to dst, src16 and src8. */
    for (col = 0; col < 16; col += 8) {
        put_pixels8_l2_shift5_mmi(dst + col, src16 + col, src8 + col,
                                  dstStride, src8Stride, h);
    }
}
1821
/*
 * 4x4 "hv" filter with averaging into dst.
 *
 * Step 1 (asm): starting two rows above src, filter 9 source rows
 * horizontally and store 4 int16_t per row into the local buffer _tmp.
 * Per output x the value is
 *   20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3])
 * (assuming ff_pw_20 / ff_pw_5 hold 20 and 5 in every 16-bit lane).
 *
 * Step 2 (C): filter each of the 4 columns of _tmp vertically with the
 * same 20/-5/1 weights and hand the sums to op2_avg() together with the
 * destination pixel.  op2_avg and INIT_CLIP are macros defined outside
 * this block; presumably op2_avg rounds, clips and averages with the
 * existing dst value - confirm against its definition.
 *
 * dst        destination pixels (read-modify-write via op2_avg)
 * src        source pixels; rows -2 .. +6 and columns -2 .. +6 are read
 * dstStride  stride used to index dst
 * srcStride  byte stride of src
 */
1822static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
1823        int dstStride, int srcStride)
1824{
1825    INIT_CLIP
1826    int i;
1827    int16_t _tmp[36];
1828    int16_t *tmp = _tmp;
1829    double ftmp[10];
1830    uint64_t tmp0;
1831    DECLARE_VAR_LOW32;
1832
        /* The vertical filter needs two rows above the block */
1833    src -= 2*srcStride;
1834
1835    __asm__ volatile (
            /* ftmp0 = 0 for byte -> halfword unpacking; tmp0 = 9 rows to process */
1836        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1837        "dli        %[tmp0],    0x09                                    \n\t"
1838        "1:                                                             \n\t"
            /* Six 4-byte loads at src-2 .. src+3, one per filter tap */
1839        MMI_ULWC1(%[ftmp1], %[src], -0x02)
1840        MMI_ULWC1(%[ftmp2], %[src], -0x01)
1841        MMI_ULWC1(%[ftmp3], %[src],  0x00)
1842        MMI_ULWC1(%[ftmp4], %[src],  0x01)
1843        MMI_ULWC1(%[ftmp5], %[src],  0x02)
1844        MMI_ULWC1(%[ftmp6], %[src],  0x03)
1845        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1846        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1847        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1848        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1849        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1850        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
            /* ftmp7 = centre pair, ftmp8 = middle pair, ftmp9 = outer pair */
1851        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1852        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
1853        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
1854        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
1855        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
1856        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1857        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
            /*
             * Store 4 halfwords (8 bytes) and advance tmp by tmpStride = 8 bytes.
             * NOTE(review): MMI_SDC1 presumably needs an 8-byte aligned
             * address, which a plain int16_t array does not guarantee -
             * confirm.
             */
1858        MMI_SDC1(%[ftmp9], %[tmp], 0x00)
1859        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
1860        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
1861        PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
1862        "bnez       %[tmp0],    1b                                      \n\t"
1863        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1864          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1865          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1866          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1867          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1868          [tmp0]"=&r"(tmp0),
1869          RESTRICT_ASM_LOW32
1870          [tmp]"+&r"(tmp),                  [src]"+&r"(src)
1871        : [tmpStride]"r"(8),
1872          [srcStride]"r"((mips_reg)srcStride),
1873          [ff_pw_20]"f"(ff_pw_20.f),        [ff_pw_5]"f"(ff_pw_5.f)
1874        : "memory"
1875    );
1876
        /*
         * The asm left tmp at _tmp + 36 (9 rows of 4 elements).  Moving
         * back 28 elements puts it on row 2, so tmp[-8] and tmp[-4] are
         * rows 0 and 1 and tmp[24] is row 8 of the current column.
         */
1877    tmp -= 28;
1878
1879    for (i=0; i<4; i++) {
            /* Nine vertically adjacent intermediates of this column; the
             * local tmp0 shadows the uint64_t tmp0 used by the asm above */
1880        const int16_t tmpB= tmp[-8];
1881        const int16_t tmpA= tmp[-4];
1882        const int16_t tmp0= tmp[ 0];
1883        const int16_t tmp1= tmp[ 4];
1884        const int16_t tmp2= tmp[ 8];
1885        const int16_t tmp3= tmp[12];
1886        const int16_t tmp4= tmp[16];
1887        const int16_t tmp5= tmp[20];
1888        const int16_t tmp6= tmp[24];
            /* One vertical 6-tap sum per output row of this column */
1889        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
1890        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
1891        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
1892        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
            /* Next column */
1893        dst++;
1894        tmp++;
1895    }
1896}
1897
1898static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
1899        int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
1900{
1901    int w = size >> 4;
1902    double ftmp[11];
1903    uint64_t tmp0;
1904    DECLARE_VAR_ALL64;
1905
1906    do {
1907        int h = size;
1908        __asm__ volatile (
1909            "dli        %[tmp0],    0x02                                \n\t"
1910            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
1911            "dli        %[tmp0],    0x06                                \n\t"
1912            "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
1913            "1:                                                         \n\t"
1914            MMI_LDC1(%[ftmp0], %[tmp], 0x00)
1915            MMI_LDC1(%[ftmp3], %[tmp], 0x08)
1916            MMI_ULDC1(%[ftmp1], %[tmp], 0x02)
1917            MMI_ULDC1(%[ftmp4], %[tmp], 0x0a)
1918            MMI_LDC1(%[ftmp7], %[tmp], 0x10)
1919            MMI_ULDC1(%[ftmp8], %[tmp], 0x12)
1920            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
1921            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
1922            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
1923            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1924            MMI_ULDC1(%[ftmp2], %[tmp], 0x04)
1925            MMI_ULDC1(%[ftmp5], %[tmp], 0x0c)
1926            MMI_ULDC1(%[ftmp7], %[tmp], 0x06)
1927            MMI_ULDC1(%[ftmp8], %[tmp], 0x0e)
1928            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1929            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp8]            \n\t"
1930            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1931            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
1932            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
1933            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
1934            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1935            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
1936            "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
1937            "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1938            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
1939            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
1940            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
1941            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1942            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"
1943            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
1944            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
1945            MMI_LDC1(%[ftmp6], %[dst], 0x00)
1946            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
1947            MMI_SDC1(%[ftmp0], %[dst], 0x00)
1948            "addi       %[h],       %[h],           -0x01               \n\t"
1949            PTR_ADDI   "%[tmp],     %[tmp],         0x30                \n\t"
1950            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1951            "bnez       %[h],       1b                                  \n\t"
1952            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1953              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1954              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1955              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1956              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1957              [ftmp10]"=&f"(ftmp[10]),
1958              [tmp0]"=&r"(tmp0),
1959              RESTRICT_ASM_ALL64
1960              [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
1961              [h]"+&r"(h)
1962            : [dstStride]"r"((mips_reg)dstStride)
1963            : "memory"
1964        );
1965
1966        tmp += 8 - size * 24;
1967        dst += 8 - size * dstStride;
1968    } while (w--);
1969}
1970
/*
 * Two-pass "hv" avg filter shared by the 8x8 and 16x16 entry points.
 * The first pass is the same routine the put variant uses; only the
 * second pass differs (it averages with dst instead of overwriting it).
 *
 * dst        destination pixels
 * tmp        scratch buffer for the 16-bit intermediates produced by the
 *            first pass and consumed by the second
 * src        source pixels
 * dstStride  byte stride of dst
 * tmpStride  forwarded unchanged to both passes
 * srcStride  byte stride of src
 * size       block size, forwarded unchanged to both passes
 */
static void avg_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
                                              const uint8_t *src,
                                              ptrdiff_t dstStride,
                                              ptrdiff_t tmpStride,
                                              ptrdiff_t srcStride, int size)
{
    /* Pass 1: filter src into the 16-bit intermediate buffer. */
    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src,
                                       tmpStride, srcStride, size);

    /* Pass 2: filter the intermediates and average the result into dst. */
    avg_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp,
                                       dstStride, tmpStride, size);
}
1978
/*
 * 8x8 "hv" avg filter: fixes the block size of the shared 8/16
 * implementation to 8.  All other arguments are forwarded unchanged.
 */
static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
                                          const uint8_t *src,
                                          ptrdiff_t dstStride,
                                          ptrdiff_t tmpStride,
                                          ptrdiff_t srcStride)
{
    const int block_size = 8;

    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
                                      srcStride, block_size);
}
1986
/*
 * 16x16 "hv" avg filter: fixes the block size of the shared 8/16
 * implementation to 16.  All other arguments are forwarded unchanged.
 */
static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
                                           const uint8_t *src,
                                           ptrdiff_t dstStride,
                                           ptrdiff_t tmpStride,
                                           ptrdiff_t srcStride)
{
    const int block_size = 16;

    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
                                      srcStride, block_size);
}
1994
1995static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
1996        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
1997{
1998    double ftmp[10];
1999    uint64_t tmp[2];
2000    DECLARE_VAR_LOW32;
2001    DECLARE_VAR_ALL64;
2002
2003    __asm__ volatile (
2004        "dli        %[tmp1],    0x02                                    \n\t"
2005        "ori        %[tmp0],    $0,             0x8                     \n\t"
2006        "mtc1       %[tmp1],    %[ftmp7]                                \n\t"
2007        "dli        %[tmp1],    0x05                                    \n\t"
2008        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
2009        "mtc1       %[tmp1],    %[ftmp8]                                \n\t"
2010        "1:                                                             \n\t"
2011        MMI_ULDC1(%[ftmp1], %[src], 0x00)
2012        MMI_ULDC1(%[ftmp2], %[src], 0x01)
2013        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
2014        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
2015        "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
2016        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2017        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2018        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
2019        "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
2020        "psllh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
2021        MMI_ULDC1(%[ftmp2], %[src], -0x01)
2022        MMI_ULDC1(%[ftmp5], %[src],  0x02)
2023        "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
2024        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2025        "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
2026        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
2027        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
2028        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
2029        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2030        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
2031        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
2032        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
2033        MMI_ULWC1(%[ftmp2], %[src], -0x02)
2034        MMI_ULWC1(%[ftmp6], %[src],  0x07)
2035        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2036        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
2037        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
2038        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
2039        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
2040        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
2041        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2042        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
2043        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
2044        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
2045        MMI_LDC1(%[ftmp5], %[src2], 0x00)
2046        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
2047        MMI_LDC1(%[ftmp9], %[dst], 0x00)
2048        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
2049        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
2050        PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
2051        MMI_SDC1(%[ftmp1], %[dst], 0x00)
2052        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
2053        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
2054        PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
2055        "bgtz       %[tmp0],    1b                                      \n\t"
2056        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2057          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2058          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2059          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2060          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
2061          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
2062          RESTRICT_ASM_LOW32
2063          RESTRICT_ASM_ALL64
2064          [dst]"+&r"(dst),                  [src]"+&r"(src),
2065          [src2]"+&r"(src2)
2066        : [dstStride]"r"((mips_reg)dstStride),
2067          [src2Stride]"r"((mips_reg)src2Stride),
2068          [ff_pw_5]"f"(ff_pw_5.f),          [ff_pw_16]"f"(ff_pw_16.f)
2069        : "memory"
2070    );
2071}
2072
/*
 * 16-wide variant built from the 8-wide avg h-lowpass-l2 kernel: the kernel
 * is invoked once per quadrant, at column offsets 0 and 8 for the upper
 * half and again 8 rows further down.
 *
 * Note that src is stepped by dstStride (there is no separate source
 * stride argument), while src2 is stepped by src2Stride.
 */
static void avg_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
{
    int row, col;

    for (row = 0; row < 16; row += 8) {
        for (col = 0; col < 16; col += 8) {
            avg_h264_qpel8_h_lowpass_l2_mmi(dst  + row * dstStride  + col,
                                            src  + row * dstStride  + col,
                                            src2 + row * src2Stride + col,
                                            dstStride, src2Stride);
        }
    }
}
2088
2089static void avg_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
2090        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
2091{
2092    double ftmp[8];
2093    uint64_t tmp0;
2094    DECLARE_VAR_ALL64;
2095    DECLARE_VAR_ADDRT;
2096
2097    do {
2098        __asm__ volatile (
2099            "dli        %[tmp0],    0x05                                \n\t"
2100            MMI_ULDC1(%[ftmp0], %[src16], 0x00)
2101            "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
2102            MMI_ULDC1(%[ftmp1], %[src16], 0x08)
2103            MMI_ULDC1(%[ftmp2], %[src16], 0x30)
2104            MMI_ULDC1(%[ftmp3], %[src16], 0x38)
2105            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
2106            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
2107            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
2108            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
2109            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
2110            MMI_LDC1(%[ftmp4], %[src8], 0x00)
2111            MMI_LDXC1(%[ftmp5], %[src8], %[src8Stride], 0x00)
2112            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
2113            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
2114            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
2115            MMI_LDC1(%[ftmp7], %[dst], 0x00)
2116            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
2117            MMI_SDC1(%[ftmp0], %[dst], 0x00)
2118            MMI_LDXC1(%[ftmp7], %[dst], %[dstStride], 0x00)
2119            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
2120            MMI_SDXC1(%[ftmp2], %[dst], %[dstStride], 0x00)
2121            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2122              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2123              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2124              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2125              RESTRICT_ASM_ALL64
2126              RESTRICT_ASM_ADDRT
2127              [tmp0]"=&r"(tmp0)
2128            : [src8]"r"(src8),              [src16]"r"(src16),
2129              [dst]"r"(dst),
2130              [src8Stride]"r"((mips_reg)src8Stride),
2131              [dstStride]"r"((mips_reg)dstStride)
2132            : "memory"
2133        );
2134
2135        src8  += 2 * src8Stride;
2136        src16 += 48;
2137        dst   += 2 * dstStride;
2138    } while (b -= 2);
2139}
2140
/*
 * 16-wide variant: runs avg_pixels8_l2_shift5_mmi on the left 8 columns
 * and then on the right 8 columns, offsetting dst, src16 and src8 by the
 * same column count.
 */
static void avg_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        avg_pixels8_l2_shift5_mmi(dst + col, src16 + col, src8 + col,
                                  dstStride, src8Stride, b);
    }
}
2148
2149//DEF_H264_MC_MMI(put_, 4)
/* qpel4 mc00 (put): no filtering here, the block is handed to
 * ff_put_pixels4_8_mmi with a height of 4 rows. */
void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const int rows = 4;

    ff_put_pixels4_8_mmi(dst, src, stride, rows);
}
2155
/* qpel4 mc10 (put): horizontal lowpass of src into a 16-byte scratch block
 * (pitch 4); src and that block are then passed to ff_put_pixels4_l2_8_mmi
 * to produce dst. */
void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];

    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels4_l2_8_mmi(dst, src, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2163
/* qpel4 mc20 (put): horizontal lowpass written straight to dst; the same
 * stride is used for destination and source. */
void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    put_h264_qpel4_h_lowpass_mmi(dst, src, line, line);
}
2169
/* qpel4 mc30 (put): horizontal lowpass of src into a scratch block, which
 * is then paired with the source shifted one byte to the right (src + 1)
 * in ff_put_pixels4_l2_8_mmi to produce dst. */
void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    const uint8_t *const right = src + 1;

    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels4_l2_8_mmi(dst, right, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2177
/* qpel4 mc01 (put): copies 9 rows starting two rows above src into a
 * pitch-4 buffer, runs the vertical lowpass from the buffer's third row
 * into a scratch block, then passes that third row onward and the scratch
 * block to ff_put_pixels4_l2_8_mmi to produce dst. */
void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_put_pixels4_l2_8_mmi(dst, centre, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2188
/* qpel4 mc02 (put): copies 9 rows starting two rows above src into a
 * pitch-4 buffer and runs the vertical lowpass from the buffer's third
 * row directly into dst. */
void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(dst, centre, stride, QPEL_W);
}
2197
/* qpel4 mc03 (put): same preparation as mc01, but the copied rows handed
 * to ff_put_pixels4_l2_8_mmi start one row lower (fourth row of the
 * pitch-4 buffer) when pairing with the vertical lowpass output. */
void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_put_pixels4_l2_8_mmi(dst, centre + QPEL_W, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2208
/* qpel4 mc11 (put): builds a vertical lowpass block (from 9 copied rows
 * beginning two rows above src) and a horizontal lowpass block (from src),
 * then passes both to ff_put_pixels4_l2_8_mmi to produce dst. */
void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_put_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2221
/* qpel4 mc31 (put): like mc11, except the rows copied for the vertical
 * lowpass are taken one byte to the right (src - 2 * stride + 1). */
void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block4_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_put_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2234
/* qpel4 mc13 (put): like mc11, except the horizontal lowpass reads the
 * source one row lower (src + stride). */
void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_put_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2247
/* qpel4 mc33 (put): horizontal lowpass reads one row lower (src + stride)
 * and the rows copied for the vertical lowpass are taken one byte to the
 * right; both results go to ff_put_pixels4_l2_8_mmi to produce dst. */
void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block4_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_put_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2260
/* qpel4 mc22 (put): hv lowpass written straight to dst; the same stride
 * is used for destination and source. */
void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    put_h264_qpel4_hv_lowpass_mmi(dst, src, line, line);
}
2266
/* qpel4 mc21 (put): computes the hv lowpass and the horizontal lowpass of
 * src into two scratch blocks and passes both to ff_put_pixels4_l2_8_mmi
 * to produce dst. */
void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);
    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels4_l2_8_mmi(dst, hfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2276
/* qpel4 mc23 (put): like mc21, except the horizontal lowpass reads the
 * source one row lower (src + stride). */
void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);
    put_h264_qpel4_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);
    ff_put_pixels4_l2_8_mmi(dst, hfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2286
/* qpel4 mc12 (put): computes the hv lowpass of src and a vertical lowpass
 * (from 9 copied rows beginning two rows above src), then passes both to
 * ff_put_pixels4_l2_8_mmi to produce dst. */
void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    ff_put_pixels4_l2_8_mmi(dst, vfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2299
/* qpel4 mc32 (put): like mc12, except the rows copied for the vertical
 * lowpass are taken one byte to the right (src - 2 * stride + 1). */
void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);

    copy_block4_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    ff_put_pixels4_l2_8_mmi(dst, vfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2312
2313//DEF_H264_MC_MMI(avg_, 4)
/* qpel4 mc00 (avg): no filtering here, the block is handed to
 * ff_avg_pixels4_8_mmi with a height of 4 rows. */
void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const int rows = 4;

    ff_avg_pixels4_8_mmi(dst, src, stride, rows);
}
2319
/* qpel4 mc10 (avg): horizontal lowpass of src into a 16-byte scratch block
 * (pitch 4); src and that block are then passed to ff_avg_pixels4_l2_8_mmi
 * to produce dst. */
void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];

    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, src, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2327
/* qpel4 mc20 (avg): calls the avg horizontal lowpass directly on dst; the
 * same stride is used for destination and source. */
void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    avg_h264_qpel4_h_lowpass_mmi(dst, src, line, line);
}
2333
/* qpel4 mc30 (avg): horizontal lowpass of src into a scratch block, which
 * is then paired with the source shifted one byte to the right (src + 1)
 * in ff_avg_pixels4_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    const uint8_t *const right = src + 1;

    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, right, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2341
/* qpel4 mc01 (avg): copies 9 rows starting two rows above src into a
 * pitch-4 buffer, runs the vertical lowpass from the buffer's third row
 * into a scratch block, then passes that third row onward and the scratch
 * block to ff_avg_pixels4_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_avg_pixels4_l2_8_mmi(dst, centre, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2352
/* qpel4 mc02 (avg): copies 9 rows starting two rows above src into a
 * pitch-4 buffer and calls the avg vertical lowpass from the buffer's
 * third row directly on dst. */
void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    avg_h264_qpel4_v_lowpass_mmi(dst, centre, stride, QPEL_W);
}
2361
/* qpel4 mc03 (avg): same preparation as mc01, but the copied rows handed
 * to ff_avg_pixels4_l2_8_mmi start one row lower (fourth row of the
 * pitch-4 buffer) when pairing with the vertical lowpass output. */
void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_avg_pixels4_l2_8_mmi(dst, centre + QPEL_W, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2372
/* qpel4 mc11 (avg): builds a vertical lowpass block (from 9 copied rows
 * beginning two rows above src) and a horizontal lowpass block (from src),
 * then passes both to ff_avg_pixels4_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_avg_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2385
/* qpel4 mc31 (avg): like mc11, except the rows copied for the vertical
 * lowpass are taken one byte to the right (src - 2 * stride + 1). */
void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block4_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_avg_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2398
/* qpel4 mc13 (avg): like mc11, except the horizontal lowpass reads the
 * source one row lower (src + stride). */
void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_avg_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2411
/* qpel4 mc33 (avg): horizontal lowpass reads one row lower (src + stride)
 * and the rows copied for the vertical lowpass are taken one byte to the
 * right; both results go to ff_avg_pixels4_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block4_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel4_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_avg_pixels4_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2424
/* qpel4 mc22 (avg): calls the avg hv lowpass directly on dst; the same
 * stride is used for destination and source. */
void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    avg_h264_qpel4_hv_lowpass_mmi(dst, src, line, line);
}
2430
/* qpel4 mc21 (avg): computes the hv lowpass and the horizontal lowpass of
 * src into two scratch blocks and passes both to ff_avg_pixels4_l2_8_mmi
 * to produce dst. */
void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);
    put_h264_qpel4_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, hfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2440
/* qpel4 mc23 (avg): like mc21, except the horizontal lowpass reads the
 * source one row lower (src + stride). */
void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);
    put_h264_qpel4_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, hfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2450
/* qpel4 mc12 (avg): computes the hv lowpass of src and a vertical lowpass
 * (from 9 copied rows beginning two rows above src), then passes both to
 * ff_avg_pixels4_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);

    copy_block4_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    ff_avg_pixels4_l2_8_mmi(dst, vfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2463
/* qpel4 mc32 (avg): like mc12, except the rows copied for the vertical
 * lowpass are taken one byte to the right (src - 2 * stride + 1). */
void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 4, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t hvfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    put_h264_qpel4_hv_lowpass_mmi(hvfilt, src, QPEL_W, stride);

    copy_block4_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    ff_avg_pixels4_l2_8_mmi(dst, vfilt, hvfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2476
2477//DEF_H264_MC_MMI(put_, 8)
/* qpel8 mc00 (put): no filtering here, the block is handed to
 * ff_put_pixels8_8_mmi with a height of 8 rows. */
void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const int rows = 8;

    ff_put_pixels8_8_mmi(dst, src, stride, rows);
}
2483
/* qpel8 mc10 (put): horizontal lowpass of src into a 64-byte scratch block
 * (pitch 8); src and that block are then passed to ff_put_pixels8_l2_8_mmi
 * to produce dst. */
void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8 };
    uint8_t hfilt[QPEL_W * QPEL_W];

    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels8_l2_8_mmi(dst, src, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2491
/* qpel8 mc20 (put): horizontal lowpass written straight to dst; the same
 * stride is used for destination and source. */
void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    put_h264_qpel8_h_lowpass_mmi(dst, src, line, line);
}
2497
/* qpel8 mc30 (put): horizontal lowpass of src into a scratch block, which
 * is then paired with the source shifted one byte to the right (src + 1)
 * in ff_put_pixels8_l2_8_mmi to produce dst. */
void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    const uint8_t *const right = src + 1;

    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels8_l2_8_mmi(dst, right, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2505
/* qpel8 mc01 (put): asks copy_block8_mmi for 13 rows starting two rows
 * above src into a pitch-8 buffer, runs the vertical lowpass from the
 * buffer's third row into a scratch block, then passes that third row
 * onward and the scratch block to ff_put_pixels8_l2_8_mmi to produce dst. */
void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_put_pixels8_l2_8_mmi(dst, centre, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2516
/* qpel8 mc02 (put): asks copy_block8_mmi for 13 rows starting two rows
 * above src into a pitch-8 buffer and runs the vertical lowpass from the
 * buffer's third row directly into dst. */
void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(dst, centre, stride, QPEL_W);
}
2525
/* qpel8 mc03 (put): same preparation as mc01, but the copied rows handed
 * to ff_put_pixels8_l2_8_mmi start one row lower (fourth row of the
 * pitch-8 buffer) when pairing with the vertical lowpass output. */
void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_put_pixels8_l2_8_mmi(dst, centre + QPEL_W, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2536
/* qpel8 mc11 (put): builds a vertical lowpass block (from 13 copied rows
 * beginning two rows above src) and a horizontal lowpass block (from src),
 * then passes both to ff_put_pixels8_l2_8_mmi to produce dst. */
void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_put_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2549
/* qpel8 mc31 (put): like mc11, except the rows copied for the vertical
 * lowpass are taken one byte to the right (src - 2 * stride + 1). */
void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block8_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_put_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2562
/* qpel8 mc13 (put): like mc11, except the horizontal lowpass reads the
 * source one row lower (src + stride). */
void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_put_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2575
/* qpel8 mc33 (put): horizontal lowpass reads one row lower (src + stride)
 * and the rows copied for the vertical lowpass are taken one byte to the
 * right; both results go to ff_put_pixels8_l2_8_mmi to produce dst. */
void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block8_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_put_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2588
/*
 * qpel8 mc22 (put): runs the 8-wide hv lowpass straight into dst, with the
 * same stride for destination and source and a local 192-element, 8-byte
 * aligned 16-bit scratch buffer for the intermediate values.
 *
 * The scratch is declared int16_t: the other qpel8 wrappers in this file
 * hand an int16_t pointer to the same parameter of
 * put_h264_qpel8_hv_lowpass_mmi, so an unsigned element type here would be
 * a pointer-signedness mismatch at the call.
 */
void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    int16_t __attribute__ ((aligned(8))) temp[192];

    put_h264_qpel8_hv_lowpass_mmi(dst, temp, src, stride, 8, stride);
}
2596
/* qpel8 mc21 (put): one 8-byte aligned scratch area holds the 64-byte hv
 * lowpass output followed by the 16-bit intermediate buffer. After the hv
 * lowpass, put_h264_qpel8_h_lowpass_l2_mmi is run on src with the hv output
 * (pitch 8) as its second source to produce dst. */
void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, hvfilt, stride, QPEL_W);
}
2607
/* qpel8 mc23 (put): like mc21, except the second stage reads the source
 * one row lower (src + stride). */
void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    put_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, hvfilt, stride,
                                    QPEL_W);
}
2618
/* qpel8 mc12 (put): after the hv lowpass, put_pixels8_l2_shift5_mmi is
 * given the 16-bit intermediate buffer starting at element 2 together with
 * the hv output (pitch 8) for 8 rows to produce dst. */
void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    put_pixels8_l2_shift5_mmi(dst, vtmp + 2, hvfilt, stride, QPEL_W, QPEL_W);
}
2629
/* qpel8 mc32 (put): like mc12, except the 16-bit intermediate buffer is
 * read starting at element 3. */
void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    put_pixels8_l2_shift5_mmi(dst, vtmp + 3, hvfilt, stride, QPEL_W, QPEL_W);
}
2640
2641//DEF_H264_MC_MMI(avg_, 8)
/* qpel8 mc00 (avg): no filtering here, the block is handed to
 * ff_avg_pixels8_8_mmi with a height of 8 rows. */
void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const int rows = 8;

    ff_avg_pixels8_8_mmi(dst, src, stride, rows);
}
2647
/* qpel8 mc10 (avg): horizontal lowpass of src into a 64-byte scratch block
 * (pitch 8); src and that block are then passed to ff_avg_pixels8_l2_8_mmi
 * to produce dst. */
void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8 };
    uint8_t hfilt[QPEL_W * QPEL_W];

    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_avg_pixels8_l2_8_mmi(dst, src, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2655
/* qpel8 mc20 (avg): calls the avg horizontal lowpass directly on dst; the
 * same stride is used for destination and source. */
void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    avg_h264_qpel8_h_lowpass_mmi(dst, src, line, line);
}
2661
/* qpel8 mc30 (avg): horizontal lowpass of src into a scratch block, which
 * is then paired with the source shifted one byte to the right (src + 1)
 * in ff_avg_pixels8_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    const uint8_t *const right = src + 1;

    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_avg_pixels8_l2_8_mmi(dst, right, hfilt, stride, stride, QPEL_W, QPEL_W);
}
2669
/* qpel8 mc01 (avg): asks copy_block8_mmi for 13 rows starting two rows
 * above src into a pitch-8 buffer, runs the vertical lowpass from the
 * buffer's third row into a scratch block, then passes that third row
 * onward and the scratch block to ff_avg_pixels8_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_avg_pixels8_l2_8_mmi(dst, centre, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2680
/* qpel8 mc02 (avg): asks copy_block8_mmi for 13 rows starting two rows
 * above src into a pitch-8 buffer and calls the avg vertical lowpass from
 * the buffer's third row directly on dst. */
void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    avg_h264_qpel8_v_lowpass_mmi(dst, centre, stride, QPEL_W);
}
2689
/* qpel8 mc03 (avg): same preparation as mc01, but the copied rows handed
 * to ff_avg_pixels8_l2_8_mmi start one row lower (fourth row of the
 * pitch-8 buffer) when pairing with the vertical lowpass output. */
void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_avg_pixels8_l2_8_mmi(dst, centre + QPEL_W, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2700
/* qpel8 mc11 (avg): builds a vertical lowpass block (from 13 copied rows
 * beginning two rows above src) and a horizontal lowpass block (from src),
 * then passes both to ff_avg_pixels8_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_avg_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2713
/* qpel8 mc31 (avg): like mc11, except the rows copied for the vertical
 * lowpass are taken one byte to the right (src - 2 * stride + 1). */
void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block8_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_avg_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2726
/* qpel8 mc13 (avg): like mc11, except the horizontal lowpass reads the
 * source one row lower (src + stride). */
void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block8_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_avg_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2739
/* qpel8 mc33 (avg): horizontal lowpass reads one row lower (src + stride)
 * and the rows copied for the vertical lowpass are taken one byte to the
 * right; both results go to ff_avg_pixels8_l2_8_mmi to produce dst. */
void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half, shifted one column right */
    copy_block8_mmi(padded, src - 2 * stride + 1, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel8_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half, one row down */
    put_h264_qpel8_h_lowpass_mmi(hfilt, src + stride, QPEL_W, stride);

    ff_avg_pixels8_l2_8_mmi(dst, hfilt, vfilt, stride,
                            QPEL_W, QPEL_W, QPEL_W);
}
2752
/*
 * qpel8 mc22 (avg): calls the 8-wide avg hv lowpass directly on dst, with
 * the same stride for destination and source and a local 192-element,
 * 8-byte aligned 16-bit scratch buffer for the intermediate values.
 *
 * The scratch is signed 16-bit, the same element type the neighbouring
 * qpel8 wrappers use for their hv-lowpass scratch, instead of an unsigned
 * array that differs from them only in signedness.
 */
void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    int16_t __attribute__ ((aligned(8))) temp[192];

    avg_h264_qpel8_hv_lowpass_mmi(dst, temp, src, stride, 8, stride);
}
2760
/* qpel8 mc21 (avg): one 8-byte aligned scratch area holds the 64-byte hv
 * lowpass output followed by the 16-bit intermediate buffer. After the hv
 * lowpass, avg_h264_qpel8_h_lowpass_l2_mmi is run on src with the hv output
 * (pitch 8) as its second source to produce dst. */
void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, hvfilt, stride, QPEL_W);
}
2771
/* qpel8 mc23 (avg): like mc21, except the second stage reads the source
 * one row lower (src + stride). */
void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, hvfilt, stride,
                                    QPEL_W);
}
2782
/* qpel8 mc12 (avg): after the hv lowpass, avg_pixels8_l2_shift5_mmi is
 * given the 16-bit intermediate buffer starting at element 2 together with
 * the hv output (pitch 8) for 8 rows to produce dst. */
void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    avg_pixels8_l2_shift5_mmi(dst, vtmp + 2, hvfilt, stride, QPEL_W, QPEL_W);
}
2793
/* qpel8 mc32 (avg): like mc12, except the 16-bit intermediate buffer is
 * read starting at element 3. */
void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 8, HV_BYTES = QPEL_W * QPEL_W, V16_COUNT = 192 };
    uint8_t __attribute__ ((aligned(8)))
        scratch[HV_BYTES + V16_COUNT * sizeof(int16_t)];
    uint8_t *const hvfilt = scratch;
    int16_t *const vtmp   = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hvfilt, vtmp, src, QPEL_W, QPEL_W, stride);
    avg_pixels8_l2_shift5_mmi(dst, vtmp + 3, hvfilt, stride, QPEL_W, QPEL_W);
}
2804
2805//DEF_H264_MC_MMI(put_, 16)
/* qpel16 mc00 (put): no filtering here, the block is handed to
 * ff_put_pixels16_8_mmi with a height of 16 rows. */
void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const int rows = 16;

    ff_put_pixels16_8_mmi(dst, src, stride, rows);
}
2811
/* qpel16 mc10 (put): horizontal lowpass of src into a 256-byte scratch
 * block (pitch 16); src and that block are then passed to
 * ff_put_pixels16_l2_8_mmi to produce dst. */
void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 16 };
    uint8_t hfilt[QPEL_W * QPEL_W];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels16_l2_8_mmi(dst, src, hfilt, stride, stride,
                             QPEL_W, QPEL_W);
}
2819
/* qpel16 mc20 (put): horizontal lowpass written straight to dst; the same
 * stride is used for destination and source. */
void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t line = stride;

    put_h264_qpel16_h_lowpass_mmi(dst, src, line, line);
}
2825
/* qpel16 mc30 (put): horizontal lowpass of src into a scratch block, which
 * is then paired with the source shifted one byte to the right (src + 1)
 * in ff_put_pixels16_l2_8_mmi to produce dst. */
void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 16 };
    uint8_t hfilt[QPEL_W * QPEL_W];
    const uint8_t *const right = src + 1;

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, QPEL_W, stride);
    ff_put_pixels16_l2_8_mmi(dst, right, hfilt, stride, stride,
                             QPEL_W, QPEL_W);
}
2833
/* qpel16 mc01 (put): asks copy_block16_mmi for 21 rows starting two rows
 * above src into a pitch-16 buffer, runs the vertical lowpass from the
 * buffer's third row into a scratch block, then passes that third row
 * onward and the scratch block to ff_put_pixels16_l2_8_mmi to produce
 * dst. */
void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 16, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block16_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_put_pixels16_l2_8_mmi(dst, centre, vfilt, stride,
                             QPEL_W, QPEL_W, QPEL_W);
}
2844
/* qpel16 mc02 (put): asks copy_block16_mmi for 21 rows starting two rows
 * above src into a pitch-16 buffer and runs the vertical lowpass from the
 * buffer's third row directly into dst. */
void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 16, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block16_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel16_v_lowpass_mmi(dst, centre, stride, QPEL_W);
}
2853
/* qpel16 mc03 (put): same preparation as mc01, but the copied rows handed
 * to ff_put_pixels16_l2_8_mmi start one row lower (fourth row of the
 * pitch-16 buffer) when pairing with the vertical lowpass output. */
void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 16, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    copy_block16_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);
    ff_put_pixels16_l2_8_mmi(dst, centre + QPEL_W, vfilt, stride,
                             QPEL_W, QPEL_W, QPEL_W);
}
2864
/* qpel16 mc11 (put): builds a vertical lowpass block (from 21 copied rows
 * beginning two rows above src) and a horizontal lowpass block (from src),
 * then passes both to ff_put_pixels16_l2_8_mmi to produce dst. */
void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { QPEL_W = 16, PAD_ROWS = QPEL_W + 5 };
    uint8_t padded[QPEL_W * PAD_ROWS];
    uint8_t hfilt[QPEL_W * QPEL_W];
    uint8_t vfilt[QPEL_W * QPEL_W];
    uint8_t *const centre = padded + 2 * QPEL_W;

    /* vertical half */
    copy_block16_mmi(padded, src - 2 * stride, QPEL_W, stride, PAD_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, QPEL_W, QPEL_W);

    /* horizontal half */
    put_h264_qpel16_h_lowpass_mmi(hfilt, src, QPEL_W, stride);

    ff_put_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride,
                             QPEL_W, QPEL_W, QPEL_W);
}
2877
/*
 * put / 16x16 / qpel position "mc31".
 *
 * Step 1: horizontal lowpass of src into a 256-byte scratch block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         and one byte to the right of src into a 336-byte buffer; the
 *         vertical lowpass helper then runs from byte offset 32 of it
 *         into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_put_pixels16_l2_8_mmi(),
 *         which writes dst (presumably a rounded two-source average).
 */
void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, BLK, stride);
    copy_block16_mmi(rows, src - stride*2 + 1, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_put_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
2890
/*
 * put / 16x16 / qpel position "mc13".
 *
 * Step 1: horizontal lowpass of the row below src (src + stride) into a
 *         256-byte scratch block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         src into a 336-byte buffer; the vertical lowpass helper then
 *         runs from byte offset 32 of it into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_put_pixels16_l2_8_mmi(),
 *         which writes dst (presumably a rounded two-source average).
 */
void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src + stride, BLK, stride);
    copy_block16_mmi(rows, src - stride*2, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_put_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
2903
/*
 * put / 16x16 / qpel position "mc33".
 *
 * Step 1: horizontal lowpass of the row below src (src + stride) into a
 *         256-byte scratch block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         and one byte to the right of src into a 336-byte buffer; the
 *         vertical lowpass helper then runs from byte offset 32 of it
 *         into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_put_pixels16_l2_8_mmi(),
 *         which writes dst (presumably a rounded two-source average).
 */
void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src + stride, BLK, stride);
    copy_block16_mmi(rows, src - stride*2 + 1, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_put_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
2916
/*
 * put / 16x16 / qpel position "mc22".
 *
 * Thin wrapper around the combined horizontal+vertical lowpass helper:
 * it writes straight to dst and is given an 8-byte-aligned local array of
 * 384 16-bit elements as its second argument (presumably intermediate
 * storage). The caller's stride is passed for the 4th and 6th arguments,
 * 16 for the 5th.
 */
void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, MID_ELEMS = 384 };
    uint16_t __attribute__ ((aligned(8))) mid[MID_ELEMS];

    put_h264_qpel16_hv_lowpass_mmi(dst, mid, src, stride, BLK, stride);
}
2924
/*
 * put / 16x16 / qpel position "mc21".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, put_h264_qpel16_h_lowpass_l2_mmi() is called with
 * src and the hv output and writes dst (presumably combining a horizontal
 * lowpass of src with the hv block; helper is defined elsewhere).
 */
void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    put_h264_qpel16_h_lowpass_l2_mmi(dst, src, hv_out, stride, BLK);
}
2935
/*
 * put / 16x16 / qpel position "mc23".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, put_h264_qpel16_h_lowpass_l2_mmi() is called with
 * the row below src (src + stride) and the hv output, and writes dst.
 */
void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];
    const uint8_t *const next_row = src + stride;

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    put_h264_qpel16_h_lowpass_l2_mmi(dst, next_row, hv_out, stride, BLK);
}
2946
/*
 * put / 16x16 / qpel position "mc12".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, put_pixels16_l2_shift5_mmi() is given the 16-bit
 * data starting at element 2 together with the hv output, and writes dst
 * (the "shift5" in its name suggests it scales the 16-bit values down by
 * 5 bits before combining -- confirm against the helper).
 */
void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024, COL_OFF = 2 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    put_pixels16_l2_shift5_mmi(dst, &v_mid[COL_OFF], hv_out, stride, BLK, BLK);
}
2957
/*
 * put / 16x16 / qpel position "mc32".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, put_pixels16_l2_shift5_mmi() is given the 16-bit
 * data starting at element 3 together with the hv output, and writes dst
 * (the "shift5" in its name suggests it scales the 16-bit values down by
 * 5 bits before combining -- confirm against the helper).
 */
void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024, COL_OFF = 3 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    put_pixels16_l2_shift5_mmi(dst, &v_mid[COL_OFF], hv_out, stride, BLK, BLK);
}
2968
2969//DEF_H264_MC_MMI(avg_, 16)
/*
 * avg / 16x16 / qpel position "mc00".
 *
 * No filtering is done here: the call is forwarded to
 * ff_avg_pixels16_8_mmi() with a final argument of 16 (presumably the row
 * count), using the caller's stride.
 */
void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16 };

    ff_avg_pixels16_8_mmi(dst, src, stride, BLK);
}
2975
/*
 * avg / 16x16 / qpel position "mc10".
 *
 * Step 1: the horizontal lowpass helper (put flavour) filters src into a
 *         local 256-byte scratch block (stride argument 16).
 * Step 2: src and the scratch block are handed to
 *         ff_avg_pixels16_l2_8_mmi() with dst as destination (presumably
 *         the two-source result is further averaged with what dst already
 *         holds; helper is defined elsewhere).
 */
void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16 };
    uint8_t hfilt[BLK * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, BLK, stride);
    ff_avg_pixels16_l2_8_mmi(dst, src, hfilt, stride, stride, BLK, BLK);
}
2983
/*
 * avg / 16x16 / qpel position "mc20".
 *
 * Thin wrapper: the avg flavour of the horizontal lowpass helper operates
 * directly on dst. The caller's stride is passed for both of the helper's
 * stride arguments.
 */
void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    avg_h264_qpel16_h_lowpass_mmi(dst, src, pitch, pitch);
}
2989
/*
 * avg / 16x16 / qpel position "mc30".
 *
 * Step 1: the horizontal lowpass helper (put flavour) filters src into a
 *         local 256-byte scratch block (stride argument 16).
 * Step 2: src shifted right by one byte (src + 1) and the scratch block
 *         are handed to ff_avg_pixels16_l2_8_mmi() with dst as
 *         destination (presumably averaging into existing dst contents).
 */
void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16 };
    uint8_t hfilt[BLK * BLK];
    const uint8_t *const right = src + 1;

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, BLK, stride);
    ff_avg_pixels16_l2_8_mmi(dst, right, hfilt, stride, stride, BLK, BLK);
}
2997
/*
 * avg / 16x16 / qpel position "mc01".
 *
 * Step 1: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         src, into a local 336-byte buffer with a stride argument of 16.
 * Step 2: the vertical lowpass helper (put flavour) runs from byte offset
 *         32 of that buffer into a 256-byte scratch block.
 * Step 3: the copied block at offset 32 and the filtered block are handed
 *         to ff_avg_pixels16_l2_8_mmi() with dst as destination
 *         (presumably averaging into existing dst contents).
 */
void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    copy_block16_mmi(rows, src - stride*2, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_avg_pixels16_l2_8_mmi(dst, centre, vfilt, stride, BLK, BLK, BLK);
}
3008
/*
 * avg / 16x16 / qpel position "mc02".
 *
 * Step 1: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         src, into a local 336-byte buffer with a stride argument of 16.
 * Step 2: the avg flavour of the vertical lowpass helper runs from byte
 *         offset 32 of that buffer and operates directly on dst using the
 *         caller's stride.
 */
void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    copy_block16_mmi(rows, src - stride*2, BLK, stride, SRC_ROWS);
    avg_h264_qpel16_v_lowpass_mmi(dst, centre, stride, BLK);
}
3017
/*
 * avg / 16x16 / qpel position "mc03".
 *
 * Step 1: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         src, into a local 336-byte buffer with a stride argument of 16.
 * Step 2: the vertical lowpass helper (put flavour) runs from byte offset
 *         32 of that buffer into a 256-byte scratch block.
 * Step 3: the copied data one further row down (offset 32 + 16) and the
 *         filtered block are handed to ff_avg_pixels16_l2_8_mmi() with
 *         dst as destination (presumably averaging into existing dst
 *         contents).
 */
void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];
    uint8_t *const below  = centre + BLK;

    copy_block16_mmi(rows, src - stride*2, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_avg_pixels16_l2_8_mmi(dst, below, vfilt, stride, BLK, BLK, BLK);
}
3028
/*
 * avg / 16x16 / qpel position "mc11".
 *
 * Step 1: horizontal lowpass (put flavour) of src into a 256-byte scratch
 *         block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         src into a 336-byte buffer; the vertical lowpass helper then
 *         runs from byte offset 32 of it into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_avg_pixels16_l2_8_mmi()
 *         with dst as destination (presumably averaging into existing
 *         dst contents).
 */
void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, BLK, stride);
    copy_block16_mmi(rows, src - stride*2, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_avg_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
3041
/*
 * avg / 16x16 / qpel position "mc31".
 *
 * Step 1: horizontal lowpass (put flavour) of src into a 256-byte scratch
 *         block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         and one byte to the right of src into a 336-byte buffer; the
 *         vertical lowpass helper then runs from byte offset 32 of it
 *         into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_avg_pixels16_l2_8_mmi()
 *         with dst as destination (presumably averaging into existing
 *         dst contents).
 */
void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src, BLK, stride);
    copy_block16_mmi(rows, src - stride*2 + 1, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_avg_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
3054
/*
 * avg / 16x16 / qpel position "mc13".
 *
 * Step 1: horizontal lowpass (put flavour) of the row below src
 *         (src + stride) into a 256-byte scratch block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         src into a 336-byte buffer; the vertical lowpass helper then
 *         runs from byte offset 32 of it into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_avg_pixels16_l2_8_mmi()
 *         with dst as destination (presumably averaging into existing
 *         dst contents).
 */
void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src + stride, BLK, stride);
    copy_block16_mmi(rows, src - stride*2, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_avg_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
3067
/*
 * avg / 16x16 / qpel position "mc33".
 *
 * Step 1: horizontal lowpass (put flavour) of the row below src
 *         (src + stride) into a 256-byte scratch block.
 * Step 2: copy_block16_mmi() is asked for 21 rows starting two rows above
 *         and one byte to the right of src into a 336-byte buffer; the
 *         vertical lowpass helper then runs from byte offset 32 of it
 *         into a second scratch block.
 * Step 3: both filtered blocks are handed to ff_avg_pixels16_l2_8_mmi()
 *         with dst as destination (presumably averaging into existing
 *         dst contents).
 */
void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, TOP_PAD = 2, SRC_ROWS = 21 };
    uint8_t rows[BLK * SRC_ROWS];
    uint8_t hfilt[BLK * BLK];
    uint8_t vfilt[BLK * BLK];
    uint8_t *const centre = &rows[TOP_PAD * BLK];

    put_h264_qpel16_h_lowpass_mmi(hfilt, src + stride, BLK, stride);
    copy_block16_mmi(rows, src - stride*2 + 1, BLK, stride, SRC_ROWS);
    put_h264_qpel16_v_lowpass_mmi(vfilt, centre, BLK, BLK);
    ff_avg_pixels16_l2_8_mmi(dst, hfilt, vfilt, stride, BLK, BLK, BLK);
}
3080
/*
 * avg / 16x16 / qpel position "mc22".
 *
 * Thin wrapper around the avg flavour of the combined horizontal+vertical
 * lowpass helper: it operates directly on dst and is given an
 * 8-byte-aligned local array of 384 16-bit elements as its second
 * argument (presumably intermediate storage). The caller's stride is
 * passed for the 4th and 6th arguments, 16 for the 5th.
 */
void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, MID_ELEMS = 384 };
    uint16_t __attribute__ ((aligned(8))) mid[MID_ELEMS];

    avg_h264_qpel16_hv_lowpass_mmi(dst, mid, src, stride, BLK, stride);
}
3088
/*
 * avg / 16x16 / qpel position "mc21".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper (put flavour)
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, avg_h264_qpel16_h_lowpass_l2_mmi() is called with
 * src and the hv output, with dst as destination (presumably averaging
 * into existing dst contents; helper is defined elsewhere).
 */
void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src, hv_out, stride, BLK);
}
3099
/*
 * avg / 16x16 / qpel position "mc23".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper (put flavour)
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, avg_h264_qpel16_h_lowpass_l2_mmi() is called with
 * the row below src (src + stride) and the hv output, with dst as
 * destination (presumably averaging into existing dst contents).
 */
void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];
    const uint8_t *const next_row = src + stride;

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    avg_h264_qpel16_h_lowpass_l2_mmi(dst, next_row, hv_out, stride, BLK);
}
3110
/*
 * avg / 16x16 / qpel position "mc12".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper (put flavour)
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, avg_pixels16_l2_shift5_mmi() is given the 16-bit
 * data starting at element 2 together with the hv output, with dst as
 * destination (the "shift5" in its name suggests it scales the 16-bit
 * values down by 5 bits before combining -- confirm against the helper).
 */
void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024, COL_OFF = 2 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    avg_pixels16_l2_shift5_mmi(dst, &v_mid[COL_OFF], hv_out, stride, BLK, BLK);
}
3121
/*
 * avg / 16x16 / qpel position "mc32".
 *
 * A single 8-byte-aligned 1024-byte scratch area is split in two:
 *   bytes [0, 256)    -> byte output of the hv lowpass helper (put flavour)
 *   bytes [256, 1024) -> viewed as int16_t, passed as the helper's second
 *                        argument (presumably its intermediate storage)
 *
 * After the hv pass, avg_pixels16_l2_shift5_mmi() is given the 16-bit
 * data starting at element 3 together with the hv output, with dst as
 * destination (the "shift5" in its name suggests it scales the 16-bit
 * values down by 5 bits before combining -- confirm against the helper).
 */
void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK = 16, HV_BYTES = 256, SCRATCH_BYTES = 1024, COL_OFF = 3 };
    uint8_t __attribute__ ((aligned(8))) scratch[SCRATCH_BYTES];
    uint8_t *const hv_out = &scratch[0];
    int16_t *const v_mid  = (int16_t *) &scratch[HV_BYTES];

    put_h264_qpel16_hv_lowpass_mmi(hv_out, v_mid, src, BLK, BLK, stride);
    avg_pixels16_l2_shift5_mmi(dst, &v_mid[COL_OFF], hv_out, stride, BLK, BLK);
}
3132
3133#undef op2_avg
3134#undef op2_put
3135