/*
 * Loongson SIMD optimized h264chroma
 *
 * Copyright (c) 2015 Loongson Technology Corporation Limited
 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
24
25#include "h264chroma_mips.h"
26#include "constants.h"
27#include "libavutil/mips/mmiutils.h"
28
29void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
30        int h, int x, int y)
31{
32    double ftmp[12];
33    union mmi_intfloat64 A, B, C, D, E;
34    DECLARE_VAR_ALL64;
35
36    A.i = 64;
37
38    if (!(x || y)) {
39        /* x=0, y=0, A.i=64 */
40        __asm__ volatile (
41            "1:                                                        \n\t"
42            MMI_ULDC1(%[ftmp0], %[src], 0x00)
43            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
44            MMI_ULDC1(%[ftmp1], %[src], 0x00)
45            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
46            MMI_ULDC1(%[ftmp2], %[src], 0x00)
47            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
48            MMI_ULDC1(%[ftmp3], %[src], 0x00)
49            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
50
51            "addi       %[h],       %[h],           -0x04              \n\t"
52
53            MMI_SDC1(%[ftmp0], %[dst], 0x00)
54            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
55            MMI_SDC1(%[ftmp1], %[dst], 0x00)
56            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
57            MMI_SDC1(%[ftmp2], %[dst], 0x00)
58            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
59            MMI_SDC1(%[ftmp3], %[dst], 0x00)
60            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
61            "bnez       %[h],       1b                                 \n\t"
62            : RESTRICT_ASM_ALL64
63              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
64              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
65              [dst]"+&r"(dst),              [src]"+&r"(src),
66              [h]"+&r"(h)
67            : [stride]"r"((mips_reg)stride)
68            : "memory"
69        );
70    } else if (x && y) {
71        /* x!=0, y!=0 */
72        D.i = x * y;
73        B.i = (x << 3) - D.i;
74        C.i = (y << 3) - D.i;
75        A.i = 64 - D.i - B.i - C.i;
76
77        __asm__ volatile (
78            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]           \n\t"
79            "pshufh     %[A],       %[A],           %[ftmp0]           \n\t"
80            "pshufh     %[B],       %[B],           %[ftmp0]           \n\t"
81            "mtc1       %[tmp0],    %[ftmp9]                           \n\t"
82            "pshufh     %[C],       %[C],           %[ftmp0]           \n\t"
83            "pshufh     %[D],       %[D],           %[ftmp0]           \n\t"
84
85            "1:                                                        \n\t"
86            MMI_ULDC1(%[ftmp1], %[src], 0x00)
87            MMI_ULDC1(%[ftmp2], %[src], 0x01)
88            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
89            MMI_ULDC1(%[ftmp3], %[src], 0x00)
90            MMI_ULDC1(%[ftmp4], %[src], 0x01)
91            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
92            MMI_ULDC1(%[ftmp10], %[src], 0x00)
93            MMI_ULDC1(%[ftmp11], %[src], 0x01)
94            "addi       %[h],       %[h],           -0x02              \n\t"
95
96            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]           \n\t"
97            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]           \n\t"
98            "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]           \n\t"
99            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]           \n\t"
100            "pmullh     %[ftmp5],   %[ftmp5],       %[A]               \n\t"
101            "pmullh     %[ftmp7],   %[ftmp7],       %[B]               \n\t"
102            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]           \n\t"
103            "pmullh     %[ftmp6],   %[ftmp6],       %[A]               \n\t"
104            "pmullh     %[ftmp8],   %[ftmp8],       %[B]               \n\t"
105            "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]           \n\t"
106            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]           \n\t"
107            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]           \n\t"
108            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]           \n\t"
109            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]           \n\t"
110            "pmullh     %[ftmp5],   %[ftmp5],       %[C]               \n\t"
111            "pmullh     %[ftmp7],   %[ftmp7],       %[D]               \n\t"
112            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]           \n\t"
113            "pmullh     %[ftmp6],   %[ftmp6],       %[C]               \n\t"
114            "pmullh     %[ftmp8],   %[ftmp8],       %[D]               \n\t"
115            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp8]           \n\t"
116            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]           \n\t"
117            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]           \n\t"
118            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]        \n\t"
119            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]        \n\t"
120            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]           \n\t"
121            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]           \n\t"
122            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]           \n\t"
123
124            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]           \n\t"
125            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]           \n\t"
126            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]           \n\t"
127            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]           \n\t"
128            "pmullh     %[ftmp5],   %[ftmp5],       %[A]               \n\t"
129            "pmullh     %[ftmp7],   %[ftmp7],       %[B]               \n\t"
130            "paddh      %[ftmp3],   %[ftmp5],       %[ftmp7]           \n\t"
131            "pmullh     %[ftmp6],   %[ftmp6],       %[A]               \n\t"
132            "pmullh     %[ftmp8],   %[ftmp8],       %[B]               \n\t"
133            "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]           \n\t"
134            "punpcklbh  %[ftmp5],   %[ftmp10],      %[ftmp0]           \n\t"
135            "punpckhbh  %[ftmp6],   %[ftmp10],      %[ftmp0]           \n\t"
136            "punpcklbh  %[ftmp7],   %[ftmp11],      %[ftmp0]           \n\t"
137            "punpckhbh  %[ftmp8],   %[ftmp11],      %[ftmp0]           \n\t"
138            "pmullh     %[ftmp5],   %[ftmp5],       %[C]               \n\t"
139            "pmullh     %[ftmp7],   %[ftmp7],       %[D]               \n\t"
140            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]           \n\t"
141            "pmullh     %[ftmp6],   %[ftmp6],       %[C]               \n\t"
142            "pmullh     %[ftmp8],   %[ftmp8],       %[D]               \n\t"
143            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp8]           \n\t"
144            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]           \n\t"
145            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]           \n\t"
146            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_32]        \n\t"
147            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_32]        \n\t"
148            "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp9]           \n\t"
149            "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]           \n\t"
150            "packushb   %[ftmp3],   %[ftmp3],       %[ftmp4]           \n\t"
151
152            MMI_SDC1(%[ftmp1], %[dst], 0x00)
153            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
154            MMI_SDC1(%[ftmp3], %[dst], 0x00)
155            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
156            "bnez       %[h],       1b                                 \n\t"
157            : RESTRICT_ASM_ALL64
158              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
159              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
160              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
161              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
162              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
163              [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
164              [dst]"+&r"(dst),              [src]"+&r"(src),
165              [h]"+&r"(h)
166            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
167              [A]"f"(A.f),                  [B]"f"(B.f),
168              [C]"f"(C.f),                  [D]"f"(D.f),
169              [tmp0]"r"(0x06)
170            : "memory"
171        );
172    } else if (x) {
173        /* x!=0, y==0 */
174        E.i = x << 3;
175        A.i = 64 - E.i;
176
177        __asm__ volatile (
178            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]           \n\t"
179            "pshufh     %[A],       %[A],           %[ftmp0]           \n\t"
180            "pshufh     %[E],       %[E],           %[ftmp0]           \n\t"
181            "mtc1       %[tmp0],    %[ftmp7]                           \n\t"
182
183            "1:                                                        \n\t"
184            MMI_ULDC1(%[ftmp1], %[src], 0x00)
185            MMI_ULDC1(%[ftmp2], %[src], 0x01)
186            "addi       %[h],       %[h],           -0x01              \n\t"
187            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
188
189            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]           \n\t"
190            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]           \n\t"
191            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]           \n\t"
192            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]           \n\t"
193            "pmullh     %[ftmp3],   %[ftmp3],       %[A]               \n\t"
194            "pmullh     %[ftmp5],   %[ftmp5],       %[E]               \n\t"
195            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]           \n\t"
196            "pmullh     %[ftmp4],   %[ftmp4],       %[A]               \n\t"
197            "pmullh     %[ftmp6],   %[ftmp6],       %[E]               \n\t"
198            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]           \n\t"
199
200            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]        \n\t"
201            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]        \n\t"
202            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]           \n\t"
203            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]           \n\t"
204            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]           \n\t"
205            MMI_SDC1(%[ftmp1], %[dst], 0x00)
206            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
207            "bnez       %[h],       1b                                 \n\t"
208            : RESTRICT_ASM_ALL64
209              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
210              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
211              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
212              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
213              [dst]"+&r"(dst),              [src]"+&r"(src),
214              [h]"+&r"(h)
215            : [stride]"r"((mips_reg)stride),
216              [ff_pw_32]"f"(ff_pw_32.f),    [tmp0]"r"(0x06),
217              [A]"f"(A.f),                  [E]"f"(E.f)
218            : "memory"
219        );
220    } else {
221        /* x==0, y!=0 */
222        E.i = y << 3;
223        A.i = 64 - E.i;
224
225        __asm__ volatile (
226            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]           \n\t"
227            "pshufh     %[A],       %[A],           %[ftmp0]           \n\t"
228            "pshufh     %[E],       %[E],           %[ftmp0]           \n\t"
229            "mtc1       %[tmp0],    %[ftmp7]                           \n\t"
230
231            "1:                                                        \n\t"
232            MMI_ULDC1(%[ftmp1], %[src], 0x00)
233            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
234            MMI_ULDC1(%[ftmp2], %[src], 0x00)
235            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
236            MMI_ULDC1(%[ftmp8], %[src], 0x00)
237            "addi       %[h],       %[h],           -0x02              \n\t"
238
239            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]           \n\t"
240            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]           \n\t"
241            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]           \n\t"
242            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]           \n\t"
243            "pmullh     %[ftmp3],   %[ftmp3],       %[A]               \n\t"
244            "pmullh     %[ftmp5],   %[ftmp5],       %[E]               \n\t"
245            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]           \n\t"
246            "pmullh     %[ftmp4],   %[ftmp4],       %[A]               \n\t"
247            "pmullh     %[ftmp6],   %[ftmp6],       %[E]               \n\t"
248            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]           \n\t"
249            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_32]        \n\t"
250            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_32]        \n\t"
251            "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp7]           \n\t"
252            "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp7]           \n\t"
253            "packushb   %[ftmp1],   %[ftmp3],       %[ftmp4]           \n\t"
254
255            "punpcklbh  %[ftmp3],   %[ftmp2],       %[ftmp0]           \n\t"
256            "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]           \n\t"
257            "punpcklbh  %[ftmp5],   %[ftmp8],       %[ftmp0]           \n\t"
258            "punpckhbh  %[ftmp6],   %[ftmp8],       %[ftmp0]           \n\t"
259            "pmullh     %[ftmp3],   %[ftmp3],       %[A]               \n\t"
260            "pmullh     %[ftmp5],   %[ftmp5],       %[E]               \n\t"
261            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]           \n\t"
262            "pmullh     %[ftmp4],   %[ftmp4],       %[A]               \n\t"
263            "pmullh     %[ftmp6],   %[ftmp6],       %[E]               \n\t"
264            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]           \n\t"
265            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_32]        \n\t"
266            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_32]        \n\t"
267            "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp7]           \n\t"
268            "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp7]           \n\t"
269            "packushb   %[ftmp2],   %[ftmp3],       %[ftmp4]           \n\t"
270
271            MMI_SDC1(%[ftmp1], %[dst], 0x00)
272            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
273            MMI_SDC1(%[ftmp2], %[dst], 0x00)
274            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
275            "bnez       %[h],       1b                                 \n\t"
276            : RESTRICT_ASM_ALL64
277              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
278              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
279              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
280              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
281              [ftmp8]"=&f"(ftmp[8]),
282              [dst]"+&r"(dst),              [src]"+&r"(src),
283              [h]"+&r"(h)
284            : [stride]"r"((mips_reg)stride),
285              [ff_pw_32]"f"(ff_pw_32.f),    [A]"f"(A.f),
286              [E]"f"(E.f),                  [tmp0]"r"(0x06)
287            : "memory"
288        );
289    }
290}
291
292void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
293        int h, int x, int y)
294{
295    double ftmp[10];
296    union mmi_intfloat64 A, B, C, D, E;
297    DECLARE_VAR_ALL64;
298
299    A.i = 64;
300
301    if(!(x || y)){
302        /* x=0, y=0, A.i=64 */
303        __asm__ volatile (
304            "1:                                                         \n\t"
305            MMI_ULDC1(%[ftmp0], %[src], 0x00)
306            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
307            MMI_ULDC1(%[ftmp1], %[src], 0x00)
308            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
309            MMI_LDC1(%[ftmp2], %[dst], 0x00)
310            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
311            MMI_LDC1(%[ftmp3], %[dst], 0x00)
312            PTR_SUBU   "%[dst],     %[dst],         %[stride]           \n\t"
313            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
314            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
315            MMI_SDC1(%[ftmp0], %[dst], 0x00)
316            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
317            MMI_SDC1(%[ftmp1], %[dst], 0x00)
318            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
319            "addi       %[h],       %[h],           -0x02               \n\t"
320            "bnez       %[h],       1b                                  \n\t"
321            : RESTRICT_ASM_ALL64
322              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
323              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
324              [dst]"+&r"(dst),              [src]"+&r"(src),
325              [h]"+&r"(h)
326            : [stride]"r"((mips_reg)stride)
327            : "memory"
328        );
329    } else if (x && y) {
330        /* x!=0, y!=0 */
331        D.i = x * y;
332        B.i = (x << 3) - D.i;
333        C.i = (y << 3) - D.i;
334        A.i = 64 - D.i - B.i - C.i;
335        __asm__ volatile (
336            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]       \n\t"
337            "pshufh     %[A],       %[A],           %[ftmp0]       \n\t"
338            "pshufh     %[B],       %[B],           %[ftmp0]       \n\t"
339            "mtc1       %[tmp0],    %[ftmp9]                       \n\t"
340            "pshufh     %[C],       %[C],           %[ftmp0]       \n\t"
341            "pshufh     %[D],       %[D],           %[ftmp0]       \n\t"
342
343            "1:                                                    \n\t"
344            MMI_ULDC1(%[ftmp1], %[src], 0x00)
345            MMI_ULDC1(%[ftmp2], %[src], 0x01)
346            PTR_ADDU   "%[src],     %[src],         %[stride]      \n\t"
347            MMI_ULDC1(%[ftmp3], %[src], 0x00)
348            MMI_ULDC1(%[ftmp4], %[src], 0x01)
349            "addi       %[h],       %[h],           -0x01          \n\t"
350
351            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]       \n\t"
352            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]       \n\t"
353            "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]       \n\t"
354            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]       \n\t"
355            "pmullh     %[ftmp5],   %[ftmp5],       %[A]           \n\t"
356            "pmullh     %[ftmp7],   %[ftmp7],       %[B]           \n\t"
357            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]       \n\t"
358            "pmullh     %[ftmp6],   %[ftmp6],       %[A]           \n\t"
359            "pmullh     %[ftmp8],   %[ftmp8],       %[B]           \n\t"
360            "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]       \n\t"
361
362            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]       \n\t"
363            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]       \n\t"
364            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]       \n\t"
365            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]       \n\t"
366            "pmullh     %[ftmp5],   %[ftmp5],       %[C]           \n\t"
367            "pmullh     %[ftmp7],   %[ftmp7],       %[D]           \n\t"
368            "paddh      %[ftmp3],   %[ftmp5],       %[ftmp7]       \n\t"
369            "pmullh     %[ftmp6],   %[ftmp6],       %[C]           \n\t"
370            "pmullh     %[ftmp8],   %[ftmp8],       %[D]           \n\t"
371            "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]       \n\t"
372
373            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]       \n\t"
374            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]       \n\t"
375            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]    \n\t"
376            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]    \n\t"
377            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]       \n\t"
378            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]       \n\t"
379            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
380            MMI_LDC1(%[ftmp2], %[dst], 0x00)
381            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
382            MMI_SDC1(%[ftmp1], %[dst], 0x00)
383            PTR_ADDU   "%[dst],     %[dst],         %[stride]      \n\t"
384            "bnez       %[h],       1b                             \n\t"
385            : RESTRICT_ASM_ALL64
386              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
387              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
388              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
389              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
390              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
391              [dst]"+&r"(dst),              [src]"+&r"(src),
392              [h]"+&r"(h)
393            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
394              [A]"f"(A.f),                  [B]"f"(B.f),
395              [C]"f"(C.f),                  [D]"f"(D.f),
396              [tmp0]"r"(0x06)
397            : "memory"
398        );
399    } else if (x) {
400        /* x!=0, y==0 */
401        E.i = x << 3;
402        A.i = 64 - E.i;
403        __asm__ volatile (
404            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]       \n\t"
405            "pshufh     %[A],       %[A],           %[ftmp0]       \n\t"
406            "pshufh     %[E],       %[E],           %[ftmp0]       \n\t"
407            "mtc1       %[tmp0],    %[ftmp7]                       \n\t"
408
409            "1:                                                    \n\t"
410            MMI_ULDC1(%[ftmp1], %[src], 0x00)
411            MMI_ULDC1(%[ftmp2], %[src], 0x01)
412            PTR_ADDU   "%[src],     %[src],         %[stride]      \n\t"
413            "addi       %[h],       %[h],           -0x01          \n\t"
414
415            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]       \n\t"
416            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]       \n\t"
417            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]       \n\t"
418            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]       \n\t"
419            "pmullh     %[ftmp3],   %[ftmp3],       %[A]           \n\t"
420            "pmullh     %[ftmp5],   %[ftmp5],       %[E]           \n\t"
421            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]       \n\t"
422            "pmullh     %[ftmp4],   %[ftmp4],       %[A]           \n\t"
423            "pmullh     %[ftmp6],   %[ftmp6],       %[E]           \n\t"
424            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]       \n\t"
425
426            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]    \n\t"
427            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]    \n\t"
428            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]       \n\t"
429            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]       \n\t"
430            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
431            MMI_LDC1(%[ftmp2], %[dst], 0x00)
432            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
433            MMI_SDC1(%[ftmp1], %[dst], 0x00)
434            PTR_ADDU   "%[dst],     %[dst],         %[stride]      \n\t"
435            "bnez       %[h],       1b                             \n\t"
436            : RESTRICT_ASM_ALL64
437              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
438              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
439              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
440              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
441              [dst]"+&r"(dst),              [src]"+&r"(src),
442              [h]"+&r"(h)
443            : [stride]"r"((mips_reg)stride),
444              [ff_pw_32]"f"(ff_pw_32.f),    [tmp0]"r"(0x06),
445              [A]"f"(A.f),                  [E]"f"(E.f)
446            : "memory"
447        );
448    } else {
449        /* x==0, y!=0 */
450        E.i = y << 3;
451        A.i = 64 - E.i;
452        __asm__ volatile (
453            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]       \n\t"
454            "pshufh     %[A],       %[A],           %[ftmp0]       \n\t"
455            "pshufh     %[E],       %[E],           %[ftmp0]       \n\t"
456            "mtc1       %[tmp0],    %[ftmp7]                       \n\t"
457
458            "1:                                                    \n\t"
459            MMI_ULDC1(%[ftmp1], %[src], 0x00)
460            PTR_ADDU   "%[src],     %[src],         %[stride]      \n\t"
461            MMI_ULDC1(%[ftmp2], %[src], 0x00)
462            "addi       %[h],       %[h],           -0x01          \n\t"
463
464            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]       \n\t"
465            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]       \n\t"
466            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]       \n\t"
467            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]       \n\t"
468            "pmullh     %[ftmp3],   %[ftmp3],       %[A]           \n\t"
469            "pmullh     %[ftmp5],   %[ftmp5],       %[E]           \n\t"
470            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]       \n\t"
471            "pmullh     %[ftmp4],   %[ftmp4],       %[A]           \n\t"
472            "pmullh     %[ftmp6],   %[ftmp6],       %[E]           \n\t"
473            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]       \n\t"
474
475            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]  \n\t"
476            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]  \n\t"
477            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]       \n\t"
478            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]       \n\t"
479            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
480            MMI_LDC1(%[ftmp2], %[dst], 0x00)
481            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
482            MMI_SDC1(%[ftmp1], %[dst], 0x00)
483            PTR_ADDU   "%[dst],     %[dst],         %[stride]      \n\t"
484            "bnez       %[h],       1b                             \n\t"
485            : RESTRICT_ASM_ALL64
486              [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
487              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
488              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
489              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
490              [dst]"+&r"(dst),              [src]"+&r"(src),
491              [h]"+&r"(h)
492            : [stride]"r"((mips_reg)stride),
493              [ff_pw_32]"f"(ff_pw_32.f),    [tmp0]"r"(0x06),
494              [A]"f"(A.f),                  [E]"f"(E.f)
495            : "memory"
496        );
497    }
498}
499
500void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
501        int h, int x, int y)
502{
503    double ftmp[8];
504    mips_reg addr[1];
505    union mmi_intfloat64 A, B, C, D, E;
506    DECLARE_VAR_LOW32;
507    A.i = (8 - x) * (8 - y);
508    B.i = x * (8 - y);
509    C.i = (8 - x) * y;
510    D.i = x * y;
511    E.i = B.i + C.i;
512
513    if (D.i) {
514        __asm__ volatile (
515            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
516            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
517            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
518            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
519            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
520            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
521
522            "1:                                                         \n\t"
523            MMI_ULWC1(%[ftmp1], %[src], 0x00)
524            MMI_ULWC1(%[ftmp2], %[src], 0x01)
525            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
526            MMI_ULWC1(%[ftmp3], %[src], 0x00)
527            MMI_ULWC1(%[ftmp4], %[src], 0x01)
528
529            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
530            "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
531            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
532            "pmullh     %[ftmp6],   %[ftmp6],       %[B]                \n\t"
533            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
534            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
535            "punpcklbh  %[ftmp6],   %[ftmp4],       %[ftmp0]            \n\t"
536            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
537            "pmullh     %[ftmp6],   %[ftmp6],       %[D]                \n\t"
538            "paddh      %[ftmp2],   %[ftmp5],       %[ftmp6]            \n\t"
539            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
540            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
541            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
542            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
543
544            "addi       %[h],       %[h],           -0x01               \n\t"
545            MMI_SWC1(%[ftmp1], %[dst], 0x00)
546            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
547            "bnez       %[h],       1b                                  \n\t"
548            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
549              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
550              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
551              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
552              RESTRICT_ASM_LOW32
553              [dst]"+&r"(dst),              [src]"+&r"(src),
554              [h]"+&r"(h)
555            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
556              [A]"f"(A.f),                  [B]"f"(B.f),
557              [C]"f"(C.f),                  [D]"f"(D.f),
558              [tmp0]"r"(0x06)
559            : "memory"
560        );
561    } else if (E.i) {
562        const int step = C.i ? stride : 1;
563        __asm__ volatile (
564            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
565            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
566            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
567            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
568
569            "1:                                                         \n\t"
570            MMI_ULWC1(%[ftmp1], %[src], 0x00)
571            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
572            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
573            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
574            "addi       %[h],       %[h],           -0x01               \n\t"
575            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
576            "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
577            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
578            "pmullh     %[ftmp4],   %[ftmp4],       %[E]                \n\t"
579            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp4]            \n\t"
580            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
581            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
582            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
583            MMI_SWC1(%[ftmp1], %[dst], 0x00)
584            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
585            "bnez       %[h],       1b                                  \n\t"
586            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
587              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
588              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
589              RESTRICT_ASM_LOW32
590              [addr0]"=&r"(addr[0]),
591              [dst]"+&r"(dst),              [src]"+&r"(src),
592              [h]"+&r"(h)
593            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
594              [ff_pw_32]"f"(ff_pw_32.f),    [tmp0]"r"(0x06),
595              [A]"f"(A.f),                  [E]"f"(E.f)
596            : "memory"
597        );
598    } else {
599        __asm__ volatile (
600            "1:                                                         \n\t"
601            MMI_ULWC1(%[ftmp0], %[src], 0x00)
602            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
603            MMI_ULWC1(%[ftmp1], %[src], 0x00)
604            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
605            "addi       %[h],       %[h],           -0x02               \n\t"
606            MMI_SWC1(%[ftmp0], %[dst], 0x00)
607            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
608            MMI_SWC1(%[ftmp1], %[dst], 0x00)
609            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
610            "bnez       %[h],       1b                                  \n\t"
611            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
612              [dst]"+&r"(dst),              [src]"+&r"(src),
613              RESTRICT_ASM_LOW32
614              [h]"+&r"(h)
615            : [stride]"r"((mips_reg)stride)
616            : "memory"
617        );
618    }
619}
620
621void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
622        int h, int x, int y)
623{
624    double ftmp[8];
625    mips_reg addr[1];
626    union mmi_intfloat64 A, B, C, D, E;
627    DECLARE_VAR_LOW32;
628    A.i = (8 - x) *(8 - y);
629    B.i = x * (8 - y);
630    C.i = (8 - x) * y;
631    D.i = x * y;
632    E.i = B.i + C.i;
633
634    if (D.i) {
635        __asm__ volatile (
636            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
637            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
638            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
639            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
640            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
641            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
642
643            "1:                                                         \n\t"
644            MMI_ULWC1(%[ftmp1], %[src], 0x00)
645            MMI_ULWC1(%[ftmp2], %[src], 0x01)
646            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
647            MMI_ULWC1(%[ftmp3], %[src], 0x00)
648            MMI_ULWC1(%[ftmp4], %[src], 0x01)
649
650            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
651            "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
652            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
653            "pmullh     %[ftmp6],   %[ftmp6],       %[B]                \n\t"
654            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
655            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
656            "punpcklbh  %[ftmp6],   %[ftmp4],       %[ftmp0]            \n\t"
657            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
658            "pmullh     %[ftmp6],   %[ftmp6],       %[D]                \n\t"
659            "paddh      %[ftmp2],   %[ftmp5],       %[ftmp6]            \n\t"
660            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
661            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
662            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
663            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
664            MMI_LWC1(%[ftmp2], %[dst], 0x00)
665            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
666
667            "addi       %[h],       %[h],           -0x01               \n\t"
668            MMI_SWC1(%[ftmp1], %[dst], 0x00)
669            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
670            "bnez       %[h],       1b                                  \n\t"
671            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
672              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
673              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
674              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
675              RESTRICT_ASM_LOW32
676              [dst]"+&r"(dst),              [src]"+&r"(src),
677              [h]"+&r"(h)
678            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
679              [A]"f"(A.f),                  [B]"f"(B.f),
680              [C]"f"(C.f),                  [D]"f"(D.f),
681              [tmp0]"r"(0x06)
682            : "memory"
683        );
684    } else if (E.i) {
685        const int step = C.i ? stride : 1;
686        __asm__ volatile (
687            "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
688            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
689            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
690            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
691
692            "1:                                                         \n\t"
693            MMI_ULWC1(%[ftmp1], %[src], 0x00)
694            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
695            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
696            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
697            "addi       %[h],       %[h],           -0x01               \n\t"
698            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
699            "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
700            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
701            "pmullh     %[ftmp4],   %[ftmp4],       %[E]                \n\t"
702            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp4]            \n\t"
703            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
704            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
705            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
706            MMI_LWC1(%[ftmp2], %[dst], 0x00)
707            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
708            MMI_SWC1(%[ftmp1], %[dst], 0x00)
709            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
710            "bnez       %[h],       1b                                  \n\t"
711            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
712              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
713              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
714              RESTRICT_ASM_LOW32
715              [addr0]"=&r"(addr[0]),
716              [dst]"+&r"(dst),              [src]"+&r"(src),
717              [h]"+&r"(h)
718            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
719              [ff_pw_32]"f"(ff_pw_32.f),    [tmp0]"r"(0x06),
720              [A]"f"(A.f),                  [E]"f"(E.f)
721            : "memory"
722        );
723    } else {
724        __asm__ volatile (
725            "1:                                                         \n\t"
726            MMI_ULWC1(%[ftmp0], %[src], 0x00)
727            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
728            MMI_ULWC1(%[ftmp1], %[src], 0x00)
729            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
730            "addi       %[h],       %[h],           -0x02               \n\t"
731            MMI_LWC1(%[ftmp2], %[dst], 0x00)
732            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
733            MMI_SWC1(%[ftmp0], %[dst], 0x00)
734            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
735            MMI_LWC1(%[ftmp3], %[dst], 0x00)
736            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
737            MMI_SWC1(%[ftmp1], %[dst], 0x00)
738            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
739            "bnez       %[h],       1b                                  \n\t"
740            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
741              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
742              [dst]"+&r"(dst),              [src]"+&r"(src),
743              RESTRICT_ASM_LOW32
744              [h]"+&r"(h)
745            : [stride]"r"((mips_reg)stride)
746            : "memory"
747        );
748    }
749}
750