1/*
 * Loongson SIMD optimized hpeldsp
3 *
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "hpeldsp_mips.h"
25#include "libavcodec/bit_depth_template.c"
26#include "libavutil/mips/mmiutils.h"
27#include "constants.h"
28
29void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
30    ptrdiff_t line_size, int h)
31{
32    double ftmp[4];
33    DECLARE_VAR_LOW32;
34
35    __asm__ volatile (
36        "1:                                                             \n\t"
37        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
38        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
39        MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
40        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
41
42        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
43
44        MMI_SWC1(%[ftmp0], %[block], 0x00)
45        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
46        MMI_SWC1(%[ftmp1], %[block], 0x00)
47        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
48
49        "bnez       %[h],       1b                                      \n\t"
50        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
51          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
52          RESTRICT_ASM_LOW32
53          [block]"+&r"(block),              [pixels]"+&r"(pixels),
54          [h]"+&r"(h)
55        : [line_size]"r"((mips_reg)line_size)
56        : "memory"
57    );
58}
59
/*
 * Copy an 8-pixel-wide column of rows from pixels to block.
 *
 * Each row goes through one MMI_ULDC1 load and one MMI_SDC1 store. Both
 * pointers are stepped by line_size after every row. Four rows are handled
 * per loop pass; the body executes before h is tested and the loop only
 * exits when h reaches exactly zero, so h must be a positive multiple of 4.
 */
void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                             \n\t"
        /* load four consecutive source rows into ftmp0..ftmp3 */
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"

        /* four rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"

        /* store the same four rows to the destination, in order */
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SDC1(%[ftmp2], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SDC1(%[ftmp3], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}
98
/*
 * Copy a 16-pixel-wide column of rows from pixels to block.
 *
 * Every row is moved as two halves, at byte offsets 0x00 and 0x08. Four
 * rows are handled per loop pass, using ftmp0/1/4/5 for the low halves and
 * ftmp2/3/6/7 for the high halves. The loop body runs before h is tested
 * and exits only when h reaches exactly zero, so h must be a positive
 * multiple of 4.
 */
void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                            \n\t"
        /* load four source rows, low half then high half of each */
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"

        /* four rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"

        /* store the four rows using the same register pairing as the loads */
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp2], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        MMI_SDC1(%[ftmp3], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
        MMI_SDC1(%[ftmp4], %[block], 0x00)
        MMI_SDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
        MMI_SDC1(%[ftmp5], %[block], 0x00)
        MMI_SDC1(%[ftmp7], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"

        "bnez       %[h],       1b                                     \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}
147
/*
 * Average a 4-pixel-wide column of rows from pixels into block.
 *
 * For each row the current contents of block are combined with the source
 * row by pavgb and the result is written back to block. Two rows are
 * handled per loop pass; the body runs before h is tested and the loop
 * exits only when h reaches exactly zero, so h must be a positive multiple
 * of 2.
 */
void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                             \n\t"
        /* addr0 = second source row; ftmp0/ftmp1 = source rows 0 and 1 */
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        /* addr1 = second destination row; ftmp2/ftmp3 = existing dest rows */
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
        MMI_ULWC1(%[ftmp2], %[block], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)

        /* two rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"

        /* per-byte average of source and destination, written back in place */
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWC1(%[ftmp1], %[addr1], 0x00)
        /* step both pointers past the second row just processed */
        PTR_ADDU   "%[pixels],  %[addr0],       %[line_size]            \n\t"
        PTR_ADDU   "%[block],   %[addr1],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}
184
/*
 * Average an 8-pixel-wide column of rows from pixels into block.
 *
 * Each row of block is replaced by the pavgb of itself and the matching
 * source row. The loop body is the same two-row sequence written out twice,
 * so four rows are handled per pass. It runs before h is tested and exits
 * only when h reaches exactly zero, so h must be a positive multiple of 4.
 */
void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        /* addr2 = 2 * line_size, computed once and used as the two-row step */
        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
        "1:                                                             \n\t"
        /* rows 0-1: ftmp0/ftmp1 from the source, ftmp2/ftmp3 from the dest */
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        /* the second row is written via the indexed store (block + line_size) */
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"

        /* rows 2-3: identical sequence on the advanced pointers */
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"

        /* four rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}
236
/*
 * Average a 16-pixel-wide column of rows from pixels into block.
 *
 * Every row is handled as two halves at byte offsets 0x00 and 0x08. Each
 * half of block is replaced by the pavgb of itself and the matching source
 * half. The loop body is a two-row sequence written out twice (four rows per
 * pass), with h decremented at the top of the pass. The body runs before h
 * is tested and the loop exits only when h reaches exactly zero, so h must
 * be a positive multiple of 4.
 */
void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[8];
    mips_reg addr[1];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                             \n\t"
        /* four rows are consumed by this pass */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        /* rows 0-1 of the source: low halves in ftmp0/1, high in ftmp4/5 */
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        /* rows 0-1 of the destination: low halves in ftmp2/3, high in ftmp6/7 */
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        /* write the averaged rows back over the destination rows */
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        /* block now points at row 2 */
        PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"

        /* rows 2-3: identical sequence on the advanced pointers */
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}
302
303inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
304    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
305    int h)
306{
307    double ftmp[4];
308    mips_reg addr[5];
309    DECLARE_VAR_LOW32;
310    DECLARE_VAR_ADDRT;
311
312    __asm__ volatile (
313        "1:                                                             \n\t"
314        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
315        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
316        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
317        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
318        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
319        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
320        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
321        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
322
323        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
324
325        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
326        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
327        MMI_SWC1(%[ftmp0], %[dst], 0x00)
328        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
329        MMI_SWC1(%[ftmp1], %[dst], 0x00)
330        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
331
332        "bnez       %[h],       1b                                      \n\t"
333        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
334          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
335          RESTRICT_ASM_LOW32
336          RESTRICT_ASM_ADDRT
337          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
338          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
339          [src2]"+&r"(src2),                [h]"+&r"(h)
340        : [dst_stride]"r"((mips_reg)dst_stride),
341          [src_stride1]"r"((mips_reg)src_stride1),
342          [src_stride2]"r"((mips_reg)src_stride2)
343        : "memory"
344    );
345}
346
/*
 * Write the per-byte average (pavgb) of two 8-pixel-wide sources to dst.
 *
 * dst, src1 and src2 each have their own stride. The loop body is a two-row
 * sequence written out twice, so four rows are produced per pass. It runs
 * before h is tested and exits only when h reaches exactly zero, so h must
 * be a positive multiple of 4.
 */
inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        /* addr2/addr3/addr4 = doubled strides, used as the two-row steps */
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        /* rows 0-1: ftmp0/ftmp1 from src1, ftmp2/ftmp3 from src2 */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        /* second row is stored through the indexed form (dst + dst_stride) */
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* rows 2-3: identical sequence on the advanced pointers */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* four rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}
407
/*
 * Write the per-byte average (pavgb) of two 16-pixel-wide sources to dst.
 *
 * Every row is handled as two halves at byte offsets 0x00 and 0x08; dst,
 * src1 and src2 each have their own stride. The loop body is a two-row
 * sequence written out twice, so four rows are produced per pass. It runs
 * before h is tested and exits only when h reaches exactly zero, so h must
 * be a positive multiple of 4.
 */
inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[8];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        /* addr2/addr3/addr4 = doubled strides, used as the two-row steps */
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        /*
         * rows 0-1: src1 in ftmp0/1 (low) and ftmp4/5 (high),
         *           src2 in ftmp2/3 (low) and ftmp6/7 (high)
         */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        /* second row is stored through the indexed form (dst + dst_stride) */
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* rows 2-3: identical sequence on the advanced pointers */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* four rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}
486
487inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
488    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
489    int h)
490{
491    double ftmp[6];
492    mips_reg addr[6];
493    DECLARE_VAR_LOW32;
494
495    __asm__ volatile (
496        "1:                                                             \n\t"
497        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
498        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
499        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
500        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
501        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
502        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
503        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
504        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
505        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
506        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
507        PTR_ADDU   "%[addr2],   %[dst],         %[dst_stride]           \n\t"
508        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
509        MMI_ULWC1(%[ftmp5], %[addr2], 0x00)
510        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
511        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
512        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
513        MMI_SWC1(%[ftmp0], %[dst], 0x00)
514        MMI_SWC1(%[ftmp1], %[addr2], 0x00)
515        PTR_ADDU   "%[dst],     %[addr2],       %[dst_stride]           \n\t"
516
517        "bnez       %[h],       1b                                      \n\t"
518        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
519          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
520          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
521          RESTRICT_ASM_LOW32
522          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
523          [addr2]"=&r"(addr[2]),
524          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
525          [src2]"+&r"(src2),                [h]"+&r"(h)
526        : [dst_stride]"r"((mips_reg)dst_stride),
527          [src_stride1]"r"((mips_reg)src_stride1),
528          [src_stride2]"r"((mips_reg)src_stride2)
529        : "memory"
530    );
531}
532
/*
 * Average two 8-pixel-wide sources, then average that result into dst.
 *
 * For each row: tmp = pavgb(src1, src2); dst = pavgb(tmp, dst). dst, src1
 * and src2 each have their own stride. The loop body is a two-row sequence
 * written out twice, so four rows are handled per pass. It runs before h is
 * tested and exits only when h reaches exactly zero, so h must be a positive
 * multiple of 4.
 */
inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        /* addr2/addr3/addr4 = doubled strides, used as the two-row steps */
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        /* rows 0-1: ftmp0/ftmp1 from src1, ftmp2/ftmp3 from src2 */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        /* first stage: average of the two sources */
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        /* addr5 = second destination row; ftmp4/ftmp5 = existing dest rows */
        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        /* second stage: average with the destination */
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        /* second row is stored through the indexed form (dst + dst_stride) */
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* rows 2-3: identical sequence on the advanced pointers */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* four rows consumed */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}
604
/*
 * 16-wide variant of ff_avg_pixels8_l2_8_mmi(): the block is processed as
 * two independent 8-byte-wide columns, the left one first.
 */
inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    int col;

    /* column 0 covers bytes 0..7 of every row, column 8 covers bytes 8..15 */
    for (col = 0; col < 16; col += 8) {
        ff_avg_pixels8_l2_8_mmi(dst + col, src1 + col, src2 + col,
                                dst_stride, src_stride1, src_stride2, h);
    }
}
614
/*
 * 4-wide put where each output row is the average of the source row and the
 * same row taken one byte further right; the work is delegated to
 * ff_put_pixels4_l2_8_mmi() with line_size used for all three strides.
 */
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const left  = pixels;
    const uint8_t *const right = pixels + 1;

    ff_put_pixels4_l2_8_mmi(block, left, right,
                            line_size, line_size, line_size, h);
}
621
/*
 * 8-wide put where each output row is the average of the source row and the
 * same row taken one byte further right; the work is delegated to
 * ff_put_pixels8_l2_8_mmi() with line_size used for all three strides.
 */
void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const left  = pixels;
    const uint8_t *const right = pixels + 1;

    ff_put_pixels8_l2_8_mmi(block, left, right,
                            line_size, line_size, line_size, h);
}
628
/*
 * 16-wide put where each output row is the average of the source row and the
 * same row taken one byte further right; the work is delegated to
 * ff_put_pixels16_l2_8_mmi() with line_size used for all three strides.
 */
void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const left  = pixels;
    const uint8_t *const right = pixels + 1;

    ff_put_pixels16_l2_8_mmi(block, left, right,
                             line_size, line_size, line_size, h);
}
635
/*
 * 4-wide avg: the source row and the same row taken one byte further right
 * are handed to ff_avg_pixels4_l2_8_mmi(), which averages them and then
 * averages the result into block. line_size serves as all three strides.
 */
void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const left  = pixels;
    const uint8_t *const right = pixels + 1;

    ff_avg_pixels4_l2_8_mmi(block, left, right,
                            line_size, line_size, line_size, h);
}
642
/*
 * 8-wide avg: the source row and the same row taken one byte further right
 * are handed to ff_avg_pixels8_l2_8_mmi(), which averages them and then
 * averages the result into block. line_size serves as all three strides.
 */
void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const left  = pixels;
    const uint8_t *const right = pixels + 1;

    ff_avg_pixels8_l2_8_mmi(block, left, right,
                            line_size, line_size, line_size, h);
}
649
/*
 * 16-wide variant of ff_avg_pixels8_x2_8_mmi(): the block is processed as
 * two 8-byte-wide columns, the left one first.
 */
void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    /* column 0 covers bytes 0..7 of every row, column 8 covers bytes 8..15 */
    for (col = 0; col < 16; col += 8) {
        ff_avg_pixels8_x2_8_mmi(block + col, pixels + col, line_size, h);
    }
}
656
657inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
658    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
659    int h)
660{
661    double ftmp[5];
662    mips_reg addr[5];
663    DECLARE_VAR_ALL64;
664    DECLARE_VAR_ADDRT;
665
666    __asm__ volatile (
667        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
668        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
669        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
670        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
671
672        "1:                                                             \n\t"
673        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
674        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
675        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
676        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
677        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
678        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
679        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
680        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
681        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
682        "pxor       %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
683        "pxor       %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
684        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
685        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
686        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
687        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
688        MMI_SDC1(%[ftmp0], %[dst], 0x00)
689        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
690        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
691        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
692
693        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
694        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
695        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
696        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
697        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
698        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
699        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
700        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
701        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
702        "pxor       %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
703        "pxor       %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
704        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
705        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
706        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
707        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
708        MMI_SDC1(%[ftmp0], %[dst], 0x00)
709        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
710        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
711        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
712
713        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
714        "bnez       %[h],       1b                                      \n\t"
715        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
716          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
717          [ftmp4]"=&f"(ftmp[4]),
718          RESTRICT_ASM_ALL64
719          RESTRICT_ASM_ADDRT
720          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
721          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
722          [addr4]"=&r"(addr[4]),
723          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
724          [src2]"+&r"(src2),                [h]"+&r"(h)
725        : [dst_stride]"r"((mips_reg)dst_stride),
726          [src_stride1]"r"((mips_reg)src_stride1),
727          [src_stride2]"r"((mips_reg)src_stride2)
728        : "memory"
729    );
730}
731
/*
 * "x2" no-rounding put, 8 bytes wide.
 *
 * Feeds pixels and the byte to its right (pixels + 1) into
 * ff_put_no_rnd_pixels8_l2_8_mmi(); block and both inputs use line_size as
 * their stride.
 */
void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const right = pixels + 1;

    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, right,
                                   line_size, line_size, line_size, h);
}
738
/*
 * 16-byte-wide "x2" no-rounding put: runs ff_put_no_rnd_pixels8_x2_8_mmi()
 * on the column at byte offset 0 and then on the column at byte offset 8.
 */
void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_put_no_rnd_pixels8_x2_8_mmi(block + col, pixels + col,
                                       line_size, h);
}
745
/*
 * "y2" put for the 4-pixel helper.
 *
 * The second input is pixels + line_size, i.e. the same column one stride
 * further on. Both inputs and block are passed to ff_put_pixels4_l2_8_mmi()
 * with line_size as their common stride.
 */
void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const below = pixels + line_size;

    ff_put_pixels4_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
752
/*
 * "y2" put for the 8-pixel helper.
 *
 * The second input is pixels + line_size, i.e. the same column one stride
 * further on. Both inputs and block are passed to ff_put_pixels8_l2_8_mmi()
 * with line_size as their common stride.
 */
void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const below = pixels + line_size;

    ff_put_pixels8_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
759
/*
 * "y2" put for the 16-pixel helper.
 *
 * Forwards to ff_put_pixels16_l2_8_mmi() with pixels and
 * pixels + line_size as the two inputs; line_size is the stride of all
 * three buffers.
 */
void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const below = pixels + line_size;

    ff_put_pixels16_l2_8_mmi(block, pixels, below,
                             line_size, line_size, line_size, h);
}
766
/*
 * "y2" avg for the 4-pixel helper.
 *
 * Passes pixels and pixels + line_size (one stride further on) to
 * ff_avg_pixels4_l2_8_mmi(), with line_size as the stride of block and of
 * both inputs.
 */
void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const below = pixels + line_size;

    ff_avg_pixels4_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
773
/*
 * "y2" avg for the 8-pixel helper.
 *
 * Passes pixels and pixels + line_size (one stride further on) to
 * ff_avg_pixels8_l2_8_mmi(), with line_size as the stride of block and of
 * both inputs.
 */
void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const below = pixels + line_size;

    ff_avg_pixels8_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
780
/*
 * 16-byte-wide "y2" avg, done as two ff_avg_pixels8_y2_8_mmi() calls on the
 * columns at byte offsets 0 and 8.
 */
void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_avg_pixels8_y2_8_mmi(block + col, pixels + col, line_size, h);
}
787
/*
 * "y2" no-rounding put, 8 bytes wide.
 *
 * Feeds pixels and pixels + line_size (one stride further on) into
 * ff_put_no_rnd_pixels8_l2_8_mmi(); block and both inputs use line_size as
 * their stride.
 */
void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *const below = pixels + line_size;

    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, below,
                                   line_size, line_size, line_size, h);
}
794
/*
 * 16-byte-wide "y2" no-rounding put: runs ff_put_no_rnd_pixels8_y2_8_mmi()
 * on the column at byte offset 0 and then on the column at byte offset 8.
 */
void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_put_no_rnd_pixels8_y2_8_mmi(block + col, pixels + col,
                                       line_size, h);
}
801
/*
 * 2x2 rounded average of a 4-byte-wide block, written to block.
 *
 * Every output byte is (p[x] + p[x + 1] + q[x] + q[x + 1] + 2) >> 2, where p
 * and q are two consecutive source rows. Four bytes are processed at once
 * inside a 32-bit word: each byte is split into its upper six bits
 * (pre-shifted right by 2) and its lower two bits, so that the per-byte
 * sums never carry into the neighbouring byte.
 *
 * Rows are produced in pairs; the loop runs while rows remain, so an odd h
 * yields h + 1 output rows. The first source row is read even when h <= 0.
 */
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t low2_mask  = 0x03030303UL; /* low two bits of each byte */
    const uint32_t high6_mask = 0xFCFCFCFCUL; /* high six bits of each byte */
    const uint32_t rnd_bias   = 0x02020202UL; /* +2 per byte before >> 2 */
    const uint32_t nib_mask   = 0x0F0F0F0FUL; /* per-byte range of shifted low sum */
    uint32_t left, right;
    uint32_t lo_biased, hi_biased;  /* row sums that carry the rounding bias */
    uint32_t lo_plain, hi_plain;    /* row sums without the bias */
    int rows_left;

    /* Horizontal sums of the first source row (bias included). */
    left      = AV_RN32(pixels);
    right     = AV_RN32(pixels + 1);
    lo_biased = (left & low2_mask) + (right & low2_mask) + rnd_bias;
    hi_biased = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
    pixels   += line_size;

    for (rows_left = h; rows_left > 0; rows_left -= 2) {
        /* First row of the pair: new unbiased sums + previous biased sums. */
        left     = AV_RN32(pixels);
        right    = AV_RN32(pixels + 1);
        lo_plain = (left & low2_mask) + (right & low2_mask);
        hi_plain = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
        *((uint32_t *) block) =
            hi_biased + hi_plain + (((lo_biased + lo_plain) >> 2) & nib_mask);
        pixels += line_size;
        block  += line_size;

        /* Second row of the pair: new biased sums + the unbiased ones. */
        left      = AV_RN32(pixels);
        right     = AV_RN32(pixels + 1);
        lo_biased = (left & low2_mask) + (right & low2_mask) + rnd_bias;
        hi_biased = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
        *((uint32_t *) block) =
            hi_biased + hi_plain + (((lo_biased + lo_plain) >> 2) & nib_mask);
        pixels += line_size;
        block  += line_size;
    }
}
839
840void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
841    ptrdiff_t line_size, int h)
842{
843#if 1
844    double ftmp[10];
845    mips_reg addr[2];
846    DECLARE_VAR_ALL64;
847    DECLARE_VAR_ADDRT;
848
849    __asm__ volatile (
850        "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
851        "dli        %[addr0],   0x0f                                    \n\t"
852        "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
853        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
854        "dli        %[addr0],   0x01                                    \n\t"
855        "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
856        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
857        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
858
859        "dli        %[addr0],   0x02                                    \n\t"
860        "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
861        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
862        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
863        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
864        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
865        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
866        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
867        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
868        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
869        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
870        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
871        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
872        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
873        ".p2align   3                                                   \n\t"
874
875        "1:                                                             \n\t"
876        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
877        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
878        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
879        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
880        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
881        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
882        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
883        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
884        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
885        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
886        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
887        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
888        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
889        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
890        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
891        "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
892        "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
893        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
894        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
895        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
896        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
897        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
898        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
899        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
900        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
901        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
902        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
903        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
904        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
905        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
906        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
907        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
908        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
909        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
910        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
911        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
912        "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
913        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
914        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
915        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
916        PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
917        "bnez       %[h],       1b                                      \n\t"
918        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
919          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
920          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
921          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
922          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
923          RESTRICT_ASM_ALL64
924          RESTRICT_ASM_ADDRT
925          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
926          [h]"+&r"(h),                      [pixels]"+&r"(pixels)
927        : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
928        : "memory"
929    );
930#else
931    /* FIXME HIGH BIT DEPTH */
932    int j;
933
934    for (j = 0; j < 2; j++) {
935        int i;
936        const uint32_t a = AV_RN32(pixels);
937        const uint32_t b = AV_RN32(pixels + 1);
938        uint32_t l0 = (a & 0x03030303UL) +
939                      (b & 0x03030303UL) +
940                           0x02020202UL;
941        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
942                      ((b & 0xFCFCFCFCUL) >> 2);
943        uint32_t l1, h1;
944
945        pixels += line_size;
946        for (i = 0; i < h; i += 2) {
947            uint32_t a = AV_RN32(pixels);
948            uint32_t b = AV_RN32(pixels + 1);
949            l1 = (a & 0x03030303UL) +
950                 (b & 0x03030303UL);
951            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
952                 ((b & 0xFCFCFCFCUL) >> 2);
953            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
954            pixels += line_size;
955            block  += line_size;
956            a  = AV_RN32(pixels);
957            b  = AV_RN32(pixels + 1);
958            l0 = (a & 0x03030303UL) +
959                 (b & 0x03030303UL) +
960                      0x02020202UL;
961            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
962                 ((b & 0xFCFCFCFCUL) >> 2);
963            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
964            pixels += line_size;
965            block  += line_size;
966        }
967        pixels += 4 - line_size * (h + 1);
968        block  += 4 - line_size * h;
969    }
970#endif
971}
972
/*
 * 16-byte-wide "xy2" put, done as two ff_put_pixels8_xy2_8_mmi() calls on
 * the columns at byte offsets 0 and 8.
 */
void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_put_pixels8_xy2_8_mmi(block + col, pixels + col, line_size, h);
}
979
/*
 * 2x2 rounded average of a 4-byte-wide block, merged into block through
 * rnd_avg32().
 *
 * The interpolated value of each byte is
 * (p[x] + p[x + 1] + q[x] + q[x + 1] + 2) >> 2 for two consecutive source
 * rows p and q, computed four bytes at a time in a 32-bit word (upper six
 * and lower two bits of each byte are summed separately so nothing carries
 * between bytes). The word already stored in block and the interpolated
 * word are then combined with rnd_avg32() and written back.
 *
 * Rows are produced in pairs, so an odd h yields h + 1 output rows. The
 * first source row is read even when h <= 0.
 */
void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t low2_mask  = 0x03030303UL; /* low two bits of each byte */
    const uint32_t high6_mask = 0xFCFCFCFCUL; /* high six bits of each byte */
    const uint32_t rnd_bias   = 0x02020202UL; /* +2 per byte before >> 2 */
    const uint32_t nib_mask   = 0x0F0F0F0FUL; /* per-byte range of shifted low sum */
    uint32_t left, right, interp;
    uint32_t lo_biased, hi_biased;  /* row sums that carry the rounding bias */
    uint32_t lo_plain, hi_plain;    /* row sums without the bias */
    uint32_t *out;
    int rows_left;

    /* Horizontal sums of the first source row (bias included). */
    left      = AV_RN32(pixels);
    right     = AV_RN32(pixels + 1);
    lo_biased = (left & low2_mask) + (right & low2_mask) + rnd_bias;
    hi_biased = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
    pixels   += line_size;

    for (rows_left = h; rows_left > 0; rows_left -= 2) {
        /* First row of the pair. */
        left     = AV_RN32(pixels);
        right    = AV_RN32(pixels + 1);
        lo_plain = (left & low2_mask) + (right & low2_mask);
        hi_plain = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
        interp   = hi_biased + hi_plain +
                   (((lo_biased + lo_plain) >> 2) & nib_mask);
        out      = (uint32_t *) block;
        *out     = rnd_avg32(*out, interp);
        pixels  += line_size;
        block   += line_size;

        /* Second row of the pair. */
        left      = AV_RN32(pixels);
        right     = AV_RN32(pixels + 1);
        lo_biased = (left & low2_mask) + (right & low2_mask) + rnd_bias;
        hi_biased = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
        interp    = hi_biased + hi_plain +
                    (((lo_biased + lo_plain) >> 2) & nib_mask);
        out       = (uint32_t *) block;
        *out      = rnd_avg32(*out, interp);
        pixels   += line_size;
        block    += line_size;
    }
}
1017
/*
 * 2x2 rounded average of an 8-byte-wide block, merged into block through
 * rnd_avg32().
 *
 * The block is processed as two 4-byte columns. Within a column each byte
 * is interpolated as (p[x] + p[x + 1] + q[x] + q[x + 1] + 2) >> 2 for two
 * consecutive source rows p and q, four bytes at a time in a 32-bit word,
 * and the result is combined with the word already in block by rnd_avg32().
 *
 * After a column, both pointers are moved 4 bytes right and rewound by
 * line_size * (h + 1) resp. line_size * h; this rewind matches the rows
 * actually walked only when h is even, because rows are produced in pairs.
 */
void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t low2_mask  = 0x03030303UL; /* low two bits of each byte */
    const uint32_t high6_mask = 0xFCFCFCFCUL; /* high six bits of each byte */
    const uint32_t rnd_bias   = 0x02020202UL; /* +2 per byte before >> 2 */
    const uint32_t nib_mask   = 0x0F0F0F0FUL; /* per-byte range of shifted low sum */
    int col;

    for (col = 0; col < 2; col++) {
        uint32_t left, right, interp;
        uint32_t lo_biased, hi_biased;  /* row sums carrying the bias */
        uint32_t lo_plain, hi_plain;    /* row sums without the bias */
        uint32_t *out;
        int rows_left;

        /* Horizontal sums of the column's first source row. */
        left      = AV_RN32(pixels);
        right     = AV_RN32(pixels + 1);
        lo_biased = (left & low2_mask) + (right & low2_mask) + rnd_bias;
        hi_biased = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
        pixels   += line_size;

        for (rows_left = h; rows_left > 0; rows_left -= 2) {
            /* First row of the pair. */
            left     = AV_RN32(pixels);
            right    = AV_RN32(pixels + 1);
            lo_plain = (left & low2_mask) + (right & low2_mask);
            hi_plain = ((left & high6_mask) >> 2) +
                       ((right & high6_mask) >> 2);
            interp   = hi_biased + hi_plain +
                       (((lo_biased + lo_plain) >> 2) & nib_mask);
            out      = (uint32_t *) block;
            *out     = rnd_avg32(*out, interp);
            pixels  += line_size;
            block   += line_size;

            /* Second row of the pair. */
            left      = AV_RN32(pixels);
            right     = AV_RN32(pixels + 1);
            lo_biased = (left & low2_mask) + (right & low2_mask) + rnd_bias;
            hi_biased = ((left & high6_mask) >> 2) +
                        ((right & high6_mask) >> 2);
            interp    = hi_biased + hi_plain +
                        (((lo_biased + lo_plain) >> 2) & nib_mask);
            out       = (uint32_t *) block;
            *out      = rnd_avg32(*out, interp);
            pixels   += line_size;
            block    += line_size;
        }

        /* Step to the next 4-byte column and back to its top row. */
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}
1061
/*
 * 16-byte-wide "xy2" avg, done as two ff_avg_pixels8_xy2_8_mmi() calls on
 * the columns at byte offsets 0 and 8.
 */
void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_avg_pixels8_xy2_8_mmi(block + col, pixels + col, line_size, h);
}
1068
/*
 * 2x2 average of an 8-byte-wide block with the smaller bias, written to
 * block.
 *
 * Same scheme as the rounded xy2 put, but the per-byte bias added before
 * the shift is 1 instead of 2: every output byte is
 * (p[x] + p[x + 1] + q[x] + q[x + 1] + 1) >> 2 for two consecutive source
 * rows p and q. The block is processed as two 4-byte columns, four bytes at
 * a time in a 32-bit word.
 *
 * After a column, both pointers are moved 4 bytes right and rewound by
 * line_size * (h + 1) resp. line_size * h; this rewind matches the rows
 * actually walked only when h is even, because rows are produced in pairs.
 */
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t low2_mask  = 0x03030303UL; /* low two bits of each byte */
    const uint32_t high6_mask = 0xFCFCFCFCUL; /* high six bits of each byte */
    const uint32_t bias       = 0x01010101UL; /* +1 per byte before >> 2 */
    const uint32_t nib_mask   = 0x0F0F0F0FUL; /* per-byte range of shifted low sum */
    int col;

    for (col = 0; col < 2; col++) {
        uint32_t left, right;
        uint32_t lo_biased, hi_biased;  /* row sums carrying the bias */
        uint32_t lo_plain, hi_plain;    /* row sums without the bias */
        int rows_left;

        /* Horizontal sums of the column's first source row. */
        left      = AV_RN32(pixels);
        right     = AV_RN32(pixels + 1);
        lo_biased = (left & low2_mask) + (right & low2_mask) + bias;
        hi_biased = ((left & high6_mask) >> 2) + ((right & high6_mask) >> 2);
        pixels   += line_size;

        for (rows_left = h; rows_left > 0; rows_left -= 2) {
            /* First row of the pair. */
            left     = AV_RN32(pixels);
            right    = AV_RN32(pixels + 1);
            lo_plain = (left & low2_mask) + (right & low2_mask);
            hi_plain = ((left & high6_mask) >> 2) +
                       ((right & high6_mask) >> 2);
            *((uint32_t *) block) =
                hi_biased + hi_plain +
                (((lo_biased + lo_plain) >> 2) & nib_mask);
            pixels += line_size;
            block  += line_size;

            /* Second row of the pair. */
            left      = AV_RN32(pixels);
            right     = AV_RN32(pixels + 1);
            lo_biased = (left & low2_mask) + (right & low2_mask) + bias;
            hi_biased = ((left & high6_mask) >> 2) +
                        ((right & high6_mask) >> 2);
            *((uint32_t *) block) =
                hi_biased + hi_plain +
                (((lo_biased + lo_plain) >> 2) & nib_mask);
            pixels += line_size;
            block  += line_size;
        }

        /* Step to the next 4-byte column and back to its top row. */
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}
1112
/*
 * 16-byte-wide "xy2" no-rounding put: runs
 * ff_put_no_rnd_pixels8_xy2_8_mmi() on the column at byte offset 0 and then
 * on the column at byte offset 8.
 */
void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_put_no_rnd_pixels8_xy2_8_mmi(block + col, pixels + col,
                                        line_size, h);
}
1119