/*
 * Loongson SIMD optimized h264chroma
 *
 * Copyright (c) 2015 Loongson Technology Corporation Limited
 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "h264chroma_mips.h"
#include "constants.h"
#include "libavutil/mips/mmiutils.h"

ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)29 void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
30 int h, int x, int y)
31 {
32 double ftmp[12];
33 union mmi_intfloat64 A, B, C, D, E;
34 DECLARE_VAR_ALL64;
35
36 A.i = 64;
37
38 if (!(x || y)) {
39 /* x=0, y=0, A.i=64 */
40 __asm__ volatile (
41 "1: \n\t"
42 MMI_ULDC1(%[ftmp0], %[src], 0x00)
43 PTR_ADDU "%[src], %[src], %[stride] \n\t"
44 MMI_ULDC1(%[ftmp1], %[src], 0x00)
45 PTR_ADDU "%[src], %[src], %[stride] \n\t"
46 MMI_ULDC1(%[ftmp2], %[src], 0x00)
47 PTR_ADDU "%[src], %[src], %[stride] \n\t"
48 MMI_ULDC1(%[ftmp3], %[src], 0x00)
49 PTR_ADDU "%[src], %[src], %[stride] \n\t"
50
51 "addi %[h], %[h], -0x04 \n\t"
52
53 MMI_SDC1(%[ftmp0], %[dst], 0x00)
54 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
55 MMI_SDC1(%[ftmp1], %[dst], 0x00)
56 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
57 MMI_SDC1(%[ftmp2], %[dst], 0x00)
58 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
59 MMI_SDC1(%[ftmp3], %[dst], 0x00)
60 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
61 "bnez %[h], 1b \n\t"
62 : RESTRICT_ASM_ALL64
63 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
64 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
65 [dst]"+&r"(dst), [src]"+&r"(src),
66 [h]"+&r"(h)
67 : [stride]"r"((mips_reg)stride)
68 : "memory"
69 );
70 } else if (x && y) {
71 /* x!=0, y!=0 */
72 D.i = x * y;
73 B.i = (x << 3) - D.i;
74 C.i = (y << 3) - D.i;
75 A.i = 64 - D.i - B.i - C.i;
76
77 __asm__ volatile (
78 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
79 "pshufh %[A], %[A], %[ftmp0] \n\t"
80 "pshufh %[B], %[B], %[ftmp0] \n\t"
81 "mtc1 %[tmp0], %[ftmp9] \n\t"
82 "pshufh %[C], %[C], %[ftmp0] \n\t"
83 "pshufh %[D], %[D], %[ftmp0] \n\t"
84
85 "1: \n\t"
86 MMI_ULDC1(%[ftmp1], %[src], 0x00)
87 MMI_ULDC1(%[ftmp2], %[src], 0x01)
88 PTR_ADDU "%[src], %[src], %[stride] \n\t"
89 MMI_ULDC1(%[ftmp3], %[src], 0x00)
90 MMI_ULDC1(%[ftmp4], %[src], 0x01)
91 PTR_ADDU "%[src], %[src], %[stride] \n\t"
92 MMI_ULDC1(%[ftmp10], %[src], 0x00)
93 MMI_ULDC1(%[ftmp11], %[src], 0x01)
94 "addi %[h], %[h], -0x02 \n\t"
95
96 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
97 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
98 "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
99 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
100 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
101 "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
102 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
103 "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
104 "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
105 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
106 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
107 "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
108 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
109 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
110 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
111 "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
112 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
113 "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
114 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
115 "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
116 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
117 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
118 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
119 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
120 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
121 "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
122 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
123
124 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
125 "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
126 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
127 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
128 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
129 "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
130 "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
131 "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
132 "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
133 "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
134 "punpcklbh %[ftmp5], %[ftmp10], %[ftmp0] \n\t"
135 "punpckhbh %[ftmp6], %[ftmp10], %[ftmp0] \n\t"
136 "punpcklbh %[ftmp7], %[ftmp11], %[ftmp0] \n\t"
137 "punpckhbh %[ftmp8], %[ftmp11], %[ftmp0] \n\t"
138 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
139 "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
140 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
141 "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
142 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
143 "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
144 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
145 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
146 "paddh %[ftmp3], %[ftmp3], %[ff_pw_32] \n\t"
147 "paddh %[ftmp4], %[ftmp4], %[ff_pw_32] \n\t"
148 "psrlh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
149 "psrlh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
150 "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
151
152 MMI_SDC1(%[ftmp1], %[dst], 0x00)
153 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
154 MMI_SDC1(%[ftmp3], %[dst], 0x00)
155 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
156 "bnez %[h], 1b \n\t"
157 : RESTRICT_ASM_ALL64
158 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
159 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
160 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
161 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
162 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
163 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
164 [dst]"+&r"(dst), [src]"+&r"(src),
165 [h]"+&r"(h)
166 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
167 [A]"f"(A.f), [B]"f"(B.f),
168 [C]"f"(C.f), [D]"f"(D.f),
169 [tmp0]"r"(0x06)
170 : "memory"
171 );
172 } else if (x) {
173 /* x!=0, y==0 */
174 E.i = x << 3;
175 A.i = 64 - E.i;
176
177 __asm__ volatile (
178 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
179 "pshufh %[A], %[A], %[ftmp0] \n\t"
180 "pshufh %[E], %[E], %[ftmp0] \n\t"
181 "mtc1 %[tmp0], %[ftmp7] \n\t"
182
183 "1: \n\t"
184 MMI_ULDC1(%[ftmp1], %[src], 0x00)
185 MMI_ULDC1(%[ftmp2], %[src], 0x01)
186 "addi %[h], %[h], -0x01 \n\t"
187 PTR_ADDU "%[src], %[src], %[stride] \n\t"
188
189 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
190 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
191 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
192 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
193 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
194 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
195 "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
196 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
197 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
198 "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
199
200 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
201 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
202 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
203 "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
204 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
205 MMI_SDC1(%[ftmp1], %[dst], 0x00)
206 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
207 "bnez %[h], 1b \n\t"
208 : RESTRICT_ASM_ALL64
209 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
210 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
211 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
212 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
213 [dst]"+&r"(dst), [src]"+&r"(src),
214 [h]"+&r"(h)
215 : [stride]"r"((mips_reg)stride),
216 [ff_pw_32]"f"(ff_pw_32.f), [tmp0]"r"(0x06),
217 [A]"f"(A.f), [E]"f"(E.f)
218 : "memory"
219 );
220 } else {
221 /* x==0, y!=0 */
222 E.i = y << 3;
223 A.i = 64 - E.i;
224
225 __asm__ volatile (
226 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
227 "pshufh %[A], %[A], %[ftmp0] \n\t"
228 "pshufh %[E], %[E], %[ftmp0] \n\t"
229 "mtc1 %[tmp0], %[ftmp7] \n\t"
230
231 "1: \n\t"
232 MMI_ULDC1(%[ftmp1], %[src], 0x00)
233 PTR_ADDU "%[src], %[src], %[stride] \n\t"
234 MMI_ULDC1(%[ftmp2], %[src], 0x00)
235 PTR_ADDU "%[src], %[src], %[stride] \n\t"
236 MMI_ULDC1(%[ftmp8], %[src], 0x00)
237 "addi %[h], %[h], -0x02 \n\t"
238
239 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
240 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
241 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
242 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
243 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
244 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
245 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
246 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
247 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
248 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
249 "paddh %[ftmp3], %[ftmp3], %[ff_pw_32] \n\t"
250 "paddh %[ftmp4], %[ftmp4], %[ff_pw_32] \n\t"
251 "psrlh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
252 "psrlh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
253 "packushb %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
254
255 "punpcklbh %[ftmp3], %[ftmp2], %[ftmp0] \n\t"
256 "punpckhbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
257 "punpcklbh %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
258 "punpckhbh %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
259 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
260 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
261 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
262 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
263 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
264 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
265 "paddh %[ftmp3], %[ftmp3], %[ff_pw_32] \n\t"
266 "paddh %[ftmp4], %[ftmp4], %[ff_pw_32] \n\t"
267 "psrlh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
268 "psrlh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
269 "packushb %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
270
271 MMI_SDC1(%[ftmp1], %[dst], 0x00)
272 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
273 MMI_SDC1(%[ftmp2], %[dst], 0x00)
274 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
275 "bnez %[h], 1b \n\t"
276 : RESTRICT_ASM_ALL64
277 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
278 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
279 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
280 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
281 [ftmp8]"=&f"(ftmp[8]),
282 [dst]"+&r"(dst), [src]"+&r"(src),
283 [h]"+&r"(h)
284 : [stride]"r"((mips_reg)stride),
285 [ff_pw_32]"f"(ff_pw_32.f), [A]"f"(A.f),
286 [E]"f"(E.f), [tmp0]"r"(0x06)
287 : "memory"
288 );
289 }
290 }
291
ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)292 void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
293 int h, int x, int y)
294 {
295 double ftmp[10];
296 union mmi_intfloat64 A, B, C, D, E;
297 DECLARE_VAR_ALL64;
298
299 A.i = 64;
300
301 if(!(x || y)){
302 /* x=0, y=0, A.i=64 */
303 __asm__ volatile (
304 "1: \n\t"
305 MMI_ULDC1(%[ftmp0], %[src], 0x00)
306 PTR_ADDU "%[src], %[src], %[stride] \n\t"
307 MMI_ULDC1(%[ftmp1], %[src], 0x00)
308 PTR_ADDU "%[src], %[src], %[stride] \n\t"
309 MMI_LDC1(%[ftmp2], %[dst], 0x00)
310 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
311 MMI_LDC1(%[ftmp3], %[dst], 0x00)
312 PTR_SUBU "%[dst], %[dst], %[stride] \n\t"
313 "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
314 "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
315 MMI_SDC1(%[ftmp0], %[dst], 0x00)
316 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
317 MMI_SDC1(%[ftmp1], %[dst], 0x00)
318 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
319 "addi %[h], %[h], -0x02 \n\t"
320 "bnez %[h], 1b \n\t"
321 : RESTRICT_ASM_ALL64
322 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
323 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
324 [dst]"+&r"(dst), [src]"+&r"(src),
325 [h]"+&r"(h)
326 : [stride]"r"((mips_reg)stride)
327 : "memory"
328 );
329 } else if (x && y) {
330 /* x!=0, y!=0 */
331 D.i = x * y;
332 B.i = (x << 3) - D.i;
333 C.i = (y << 3) - D.i;
334 A.i = 64 - D.i - B.i - C.i;
335 __asm__ volatile (
336 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
337 "pshufh %[A], %[A], %[ftmp0] \n\t"
338 "pshufh %[B], %[B], %[ftmp0] \n\t"
339 "mtc1 %[tmp0], %[ftmp9] \n\t"
340 "pshufh %[C], %[C], %[ftmp0] \n\t"
341 "pshufh %[D], %[D], %[ftmp0] \n\t"
342
343 "1: \n\t"
344 MMI_ULDC1(%[ftmp1], %[src], 0x00)
345 MMI_ULDC1(%[ftmp2], %[src], 0x01)
346 PTR_ADDU "%[src], %[src], %[stride] \n\t"
347 MMI_ULDC1(%[ftmp3], %[src], 0x00)
348 MMI_ULDC1(%[ftmp4], %[src], 0x01)
349 "addi %[h], %[h], -0x01 \n\t"
350
351 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
352 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
353 "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
354 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
355 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
356 "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
357 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
358 "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
359 "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
360 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
361
362 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
363 "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
364 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
365 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
366 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
367 "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
368 "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
369 "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
370 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
371 "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
372
373 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
374 "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
375 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
376 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
377 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
378 "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
379 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
380 MMI_LDC1(%[ftmp2], %[dst], 0x00)
381 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
382 MMI_SDC1(%[ftmp1], %[dst], 0x00)
383 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
384 "bnez %[h], 1b \n\t"
385 : RESTRICT_ASM_ALL64
386 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
387 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
388 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
389 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
390 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
391 [dst]"+&r"(dst), [src]"+&r"(src),
392 [h]"+&r"(h)
393 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
394 [A]"f"(A.f), [B]"f"(B.f),
395 [C]"f"(C.f), [D]"f"(D.f),
396 [tmp0]"r"(0x06)
397 : "memory"
398 );
399 } else if (x) {
400 /* x!=0, y==0 */
401 E.i = x << 3;
402 A.i = 64 - E.i;
403 __asm__ volatile (
404 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
405 "pshufh %[A], %[A], %[ftmp0] \n\t"
406 "pshufh %[E], %[E], %[ftmp0] \n\t"
407 "mtc1 %[tmp0], %[ftmp7] \n\t"
408
409 "1: \n\t"
410 MMI_ULDC1(%[ftmp1], %[src], 0x00)
411 MMI_ULDC1(%[ftmp2], %[src], 0x01)
412 PTR_ADDU "%[src], %[src], %[stride] \n\t"
413 "addi %[h], %[h], -0x01 \n\t"
414
415 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
416 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
417 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
418 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
419 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
420 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
421 "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
422 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
423 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
424 "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
425
426 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
427 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
428 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
429 "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
430 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
431 MMI_LDC1(%[ftmp2], %[dst], 0x00)
432 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
433 MMI_SDC1(%[ftmp1], %[dst], 0x00)
434 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
435 "bnez %[h], 1b \n\t"
436 : RESTRICT_ASM_ALL64
437 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
438 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
439 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
440 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
441 [dst]"+&r"(dst), [src]"+&r"(src),
442 [h]"+&r"(h)
443 : [stride]"r"((mips_reg)stride),
444 [ff_pw_32]"f"(ff_pw_32.f), [tmp0]"r"(0x06),
445 [A]"f"(A.f), [E]"f"(E.f)
446 : "memory"
447 );
448 } else {
449 /* x==0, y!=0 */
450 E.i = y << 3;
451 A.i = 64 - E.i;
452 __asm__ volatile (
453 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
454 "pshufh %[A], %[A], %[ftmp0] \n\t"
455 "pshufh %[E], %[E], %[ftmp0] \n\t"
456 "mtc1 %[tmp0], %[ftmp7] \n\t"
457
458 "1: \n\t"
459 MMI_ULDC1(%[ftmp1], %[src], 0x00)
460 PTR_ADDU "%[src], %[src], %[stride] \n\t"
461 MMI_ULDC1(%[ftmp2], %[src], 0x00)
462 "addi %[h], %[h], -0x01 \n\t"
463
464 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
465 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
466 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
467 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
468 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
469 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
470 "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
471 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
472 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
473 "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
474
475 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
476 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
477 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
478 "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
479 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
480 MMI_LDC1(%[ftmp2], %[dst], 0x00)
481 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
482 MMI_SDC1(%[ftmp1], %[dst], 0x00)
483 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
484 "bnez %[h], 1b \n\t"
485 : RESTRICT_ASM_ALL64
486 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
487 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
488 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
489 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
490 [dst]"+&r"(dst), [src]"+&r"(src),
491 [h]"+&r"(h)
492 : [stride]"r"((mips_reg)stride),
493 [ff_pw_32]"f"(ff_pw_32.f), [tmp0]"r"(0x06),
494 [A]"f"(A.f), [E]"f"(E.f)
495 : "memory"
496 );
497 }
498 }
499
ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)500 void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
501 int h, int x, int y)
502 {
503 double ftmp[8];
504 mips_reg addr[1];
505 union mmi_intfloat64 A, B, C, D, E;
506 DECLARE_VAR_LOW32;
507 A.i = (8 - x) * (8 - y);
508 B.i = x * (8 - y);
509 C.i = (8 - x) * y;
510 D.i = x * y;
511 E.i = B.i + C.i;
512
513 if (D.i) {
514 __asm__ volatile (
515 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
516 "pshufh %[A], %[A], %[ftmp0] \n\t"
517 "pshufh %[B], %[B], %[ftmp0] \n\t"
518 "mtc1 %[tmp0], %[ftmp7] \n\t"
519 "pshufh %[C], %[C], %[ftmp0] \n\t"
520 "pshufh %[D], %[D], %[ftmp0] \n\t"
521
522 "1: \n\t"
523 MMI_ULWC1(%[ftmp1], %[src], 0x00)
524 MMI_ULWC1(%[ftmp2], %[src], 0x01)
525 PTR_ADDU "%[src], %[src], %[stride] \n\t"
526 MMI_ULWC1(%[ftmp3], %[src], 0x00)
527 MMI_ULWC1(%[ftmp4], %[src], 0x01)
528
529 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
530 "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
531 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
532 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
533 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
534 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
535 "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
536 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
537 "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
538 "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
539 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
540 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
541 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
542 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
543
544 "addi %[h], %[h], -0x01 \n\t"
545 MMI_SWC1(%[ftmp1], %[dst], 0x00)
546 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
547 "bnez %[h], 1b \n\t"
548 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
549 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
550 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
551 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
552 RESTRICT_ASM_LOW32
553 [dst]"+&r"(dst), [src]"+&r"(src),
554 [h]"+&r"(h)
555 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
556 [A]"f"(A.f), [B]"f"(B.f),
557 [C]"f"(C.f), [D]"f"(D.f),
558 [tmp0]"r"(0x06)
559 : "memory"
560 );
561 } else if (E.i) {
562 const int step = C.i ? stride : 1;
563 __asm__ volatile (
564 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
565 "pshufh %[A], %[A], %[ftmp0] \n\t"
566 "pshufh %[E], %[E], %[ftmp0] \n\t"
567 "mtc1 %[tmp0], %[ftmp5] \n\t"
568
569 "1: \n\t"
570 MMI_ULWC1(%[ftmp1], %[src], 0x00)
571 PTR_ADDU "%[addr0], %[src], %[step] \n\t"
572 MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
573 PTR_ADDU "%[src], %[src], %[stride] \n\t"
574 "addi %[h], %[h], -0x01 \n\t"
575 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
576 "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
577 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
578 "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
579 "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
580 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
581 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
582 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
583 MMI_SWC1(%[ftmp1], %[dst], 0x00)
584 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
585 "bnez %[h], 1b \n\t"
586 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
587 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
588 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
589 RESTRICT_ASM_LOW32
590 [addr0]"=&r"(addr[0]),
591 [dst]"+&r"(dst), [src]"+&r"(src),
592 [h]"+&r"(h)
593 : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
594 [ff_pw_32]"f"(ff_pw_32.f), [tmp0]"r"(0x06),
595 [A]"f"(A.f), [E]"f"(E.f)
596 : "memory"
597 );
598 } else {
599 __asm__ volatile (
600 "1: \n\t"
601 MMI_ULWC1(%[ftmp0], %[src], 0x00)
602 PTR_ADDU "%[src], %[src], %[stride] \n\t"
603 MMI_ULWC1(%[ftmp1], %[src], 0x00)
604 PTR_ADDU "%[src], %[src], %[stride] \n\t"
605 "addi %[h], %[h], -0x02 \n\t"
606 MMI_SWC1(%[ftmp0], %[dst], 0x00)
607 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
608 MMI_SWC1(%[ftmp1], %[dst], 0x00)
609 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
610 "bnez %[h], 1b \n\t"
611 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
612 [dst]"+&r"(dst), [src]"+&r"(src),
613 RESTRICT_ASM_LOW32
614 [h]"+&r"(h)
615 : [stride]"r"((mips_reg)stride)
616 : "memory"
617 );
618 }
619 }
620
ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)621 void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
622 int h, int x, int y)
623 {
624 double ftmp[8];
625 mips_reg addr[1];
626 union mmi_intfloat64 A, B, C, D, E;
627 DECLARE_VAR_LOW32;
628 A.i = (8 - x) *(8 - y);
629 B.i = x * (8 - y);
630 C.i = (8 - x) * y;
631 D.i = x * y;
632 E.i = B.i + C.i;
633
634 if (D.i) {
635 __asm__ volatile (
636 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
637 "pshufh %[A], %[A], %[ftmp0] \n\t"
638 "pshufh %[B], %[B], %[ftmp0] \n\t"
639 "mtc1 %[tmp0], %[ftmp7] \n\t"
640 "pshufh %[C], %[C], %[ftmp0] \n\t"
641 "pshufh %[D], %[D], %[ftmp0] \n\t"
642
643 "1: \n\t"
644 MMI_ULWC1(%[ftmp1], %[src], 0x00)
645 MMI_ULWC1(%[ftmp2], %[src], 0x01)
646 PTR_ADDU "%[src], %[src], %[stride] \n\t"
647 MMI_ULWC1(%[ftmp3], %[src], 0x00)
648 MMI_ULWC1(%[ftmp4], %[src], 0x01)
649
650 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
651 "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
652 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
653 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
654 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
655 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
656 "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
657 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
658 "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
659 "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
660 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
661 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
662 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
663 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
664 MMI_LWC1(%[ftmp2], %[dst], 0x00)
665 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
666
667 "addi %[h], %[h], -0x01 \n\t"
668 MMI_SWC1(%[ftmp1], %[dst], 0x00)
669 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
670 "bnez %[h], 1b \n\t"
671 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
672 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
673 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
674 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
675 RESTRICT_ASM_LOW32
676 [dst]"+&r"(dst), [src]"+&r"(src),
677 [h]"+&r"(h)
678 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32.f),
679 [A]"f"(A.f), [B]"f"(B.f),
680 [C]"f"(C.f), [D]"f"(D.f),
681 [tmp0]"r"(0x06)
682 : "memory"
683 );
684 } else if (E.i) {
685 const int step = C.i ? stride : 1;
686 __asm__ volatile (
687 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
688 "pshufh %[A], %[A], %[ftmp0] \n\t"
689 "pshufh %[E], %[E], %[ftmp0] \n\t"
690 "mtc1 %[tmp0], %[ftmp5] \n\t"
691
692 "1: \n\t"
693 MMI_ULWC1(%[ftmp1], %[src], 0x00)
694 PTR_ADDU "%[addr0], %[src], %[step] \n\t"
695 MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
696 PTR_ADDU "%[src], %[src], %[stride] \n\t"
697 "addi %[h], %[h], -0x01 \n\t"
698 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
699 "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
700 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
701 "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
702 "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
703 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
704 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
705 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
706 MMI_LWC1(%[ftmp2], %[dst], 0x00)
707 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
708 MMI_SWC1(%[ftmp1], %[dst], 0x00)
709 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
710 "bnez %[h], 1b \n\t"
711 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
712 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
713 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
714 RESTRICT_ASM_LOW32
715 [addr0]"=&r"(addr[0]),
716 [dst]"+&r"(dst), [src]"+&r"(src),
717 [h]"+&r"(h)
718 : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
719 [ff_pw_32]"f"(ff_pw_32.f), [tmp0]"r"(0x06),
720 [A]"f"(A.f), [E]"f"(E.f)
721 : "memory"
722 );
723 } else {
724 __asm__ volatile (
725 "1: \n\t"
726 MMI_ULWC1(%[ftmp0], %[src], 0x00)
727 PTR_ADDU "%[src], %[src], %[stride] \n\t"
728 MMI_ULWC1(%[ftmp1], %[src], 0x00)
729 PTR_ADDU "%[src], %[src], %[stride] \n\t"
730 "addi %[h], %[h], -0x02 \n\t"
731 MMI_LWC1(%[ftmp2], %[dst], 0x00)
732 "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
733 MMI_SWC1(%[ftmp0], %[dst], 0x00)
734 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
735 MMI_LWC1(%[ftmp3], %[dst], 0x00)
736 "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
737 MMI_SWC1(%[ftmp1], %[dst], 0x00)
738 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
739 "bnez %[h], 1b \n\t"
740 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
741 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
742 [dst]"+&r"(dst), [src]"+&r"(src),
743 RESTRICT_ASM_LOW32
744 [h]"+&r"(h)
745 : [stride]"r"((mips_reg)stride)
746 : "memory"
747 );
748 }
749 }
750