/*
 * Copyright (c) 2019 Shiyou Yin (yinshiyou-hf@loongson.cn)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavcodec/bit_depth_template.c"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavutil/mips/mmiutils.h"

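/*
 * Horizontal-only luma qpel interpolation: apply the 8-tap filter selected
 * by mx along each row of the 8-bit source and store the intermediate
 * result as int16_t with a row stride of MAX_PB_SIZE, processing four
 * output samples per inner-loop iteration.
 */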
#define PUT_HEVC_QPEL_H(w, x_step, src_step, dst_step)                   \
void ff_hevc_put_hevc_qpel_h##w##_8_mmi(int16_t *dst, uint8_t *_src,     \
                                        ptrdiff_t _srcstride,            \
                                        int height, intptr_t mx,         \
                                        intptr_t my, int width)          \
{                                                                        \
    int x, y;                                                            \
    pixel *src = (pixel*)_src - 3;                                       \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                    \
    double ftmp[15];                                                     \
    uint64_t rtmp[1];                                                    \
    const int8_t *filter = ff_hevc_qpel_filters[mx - 1];                 \
    DECLARE_VAR_ALL64;                                                   \
                                                                         \
    x = x_step;                                                          \
    y = height;                                                          \
    __asm__ volatile(                                                    \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
        "li           %[rtmp0],      0x08                       \n\t"    \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"    \
                                                                         \
        "1:                                                     \n\t"    \
        "2:                                                     \n\t"    \
        MMI_ULDC1(%[ftmp3], %[src], 0x00)                                \
        MMI_ULDC1(%[ftmp4], %[src], 0x01)                                \
        MMI_ULDC1(%[ftmp5], %[src], 0x02)                                \
        MMI_ULDC1(%[ftmp6], %[src], 0x03)                                \
        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"    \
        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"    \
        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"    \
        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"    \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])            \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
        MMI_USDC1(%[ftmp3], %[dst], 0x00)                                \
                                                                         \
        "daddi        %[x],          %[x],         -0x01        \n\t"    \
        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"    \
        PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"    \
        "bnez         %[x],          2b                         \n\t"    \
                                                                         \
        "daddi        %[y],          %[y],         -0x01        \n\t"    \
        "li           %[x],        " #x_step "                  \n\t"    \
        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"    \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"    \
        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"    \
        PTR_ADDIU    "%[dst],        %[dst],        0x80        \n\t"    \
        "bnez         %[y],          1b                         \n\t"    \
        : RESTRICT_ASM_ALL64                                             \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),                \
          [src]"+&r"(src), [dst]"+&r"(dst), [y]"+&r"(y),                 \
          [x]"+&r"(x)                                                    \
        : [filter]"r"(filter), [stride]"r"(srcstride)                    \
        : "memory"                                                       \
    );                                                                   \
}

PUT_HEVC_QPEL_H(4, 1, -4, -8);
PUT_HEVC_QPEL_H(8, 2, -8, -16);
PUT_HEVC_QPEL_H(12, 3, -12, -24);
PUT_HEVC_QPEL_H(16, 4, -16, -32);
PUT_HEVC_QPEL_H(24, 6, -24, -48);
PUT_HEVC_QPEL_H(32, 8, -32, -64);
PUT_HEVC_QPEL_H(48, 12, -48, -96);
PUT_HEVC_QPEL_H(64, 16, -64, -128);

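/*
 * Full (horizontal + vertical) luma qpel interpolation: the first asm block
 * runs the 8-tap horizontal filter into the on-stack tmp_array, the second
 * block filters that buffer vertically and stores the int16_t result to dst.
 */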
#define PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step)                  \
void ff_hevc_put_hevc_qpel_hv##w##_8_mmi(int16_t *dst, uint8_t *_src,    \
                                     ptrdiff_t _srcstride,               \
                                     int height, intptr_t mx,            \
                                     intptr_t my, int width)             \
{                                                                        \
    int x, y;                                                            \
    const int8_t *filter;                                                \
    pixel *src = (pixel*)_src;                                           \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                    \
    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];         \
    int16_t *tmp = tmp_array;                                            \
    double ftmp[15];                                                     \
    uint64_t rtmp[1];                                                    \
    DECLARE_VAR_ALL64;                                                   \
                                                                         \
    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                        \
    filter = ff_hevc_qpel_filters[mx - 1];                               \
    x = x_step;                                                          \
    y = height + QPEL_EXTRA;                                             \
    __asm__ volatile(                                                    \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
        "li           %[rtmp0],      0x08                       \n\t"    \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"    \
                                                                         \
        "1:                                                     \n\t"    \
        "2:                                                     \n\t"    \
        MMI_ULDC1(%[ftmp3], %[src], 0x00)                                \
        MMI_ULDC1(%[ftmp4], %[src], 0x01)                                \
        MMI_ULDC1(%[ftmp5], %[src], 0x02)                                \
        MMI_ULDC1(%[ftmp6], %[src], 0x03)                                \
        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"    \
        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"    \
        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"    \
        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"    \
        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"    \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"    \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])            \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
        MMI_USDC1(%[ftmp3], %[tmp], 0x00)                                \
                                                                         \
        "daddi        %[x],          %[x],         -0x01        \n\t"    \
        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"    \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"    \
        "bnez         %[x],          2b                         \n\t"    \
                                                                         \
        "daddi        %[y],          %[y],         -0x01        \n\t"    \
        "li           %[x],        " #x_step "                  \n\t"    \
        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"    \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #dst_step "  \n\t"    \
        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"    \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        "bnez         %[y],          1b                         \n\t"    \
        : RESTRICT_ASM_ALL64                                             \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),                \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                 \
          [x]"+&r"(x)                                                    \
        : [filter]"r"(filter), [stride]"r"(srcstride)                    \
        : "memory"                                                       \
    );                                                                   \
                                                                         \
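    /* Second pass: 8-tap vertical filter over the int16_t intermediate. */ \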
    tmp    = tmp_array + QPEL_EXTRA_BEFORE * 4 -12;                      \
    filter = ff_hevc_qpel_filters[my - 1];                               \
    x = x_step;                                                          \
    y = height;                                                          \
    __asm__ volatile(                                                    \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
        "li           %[rtmp0],      0x08                       \n\t"    \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
        "li           %[rtmp0],      0x06                       \n\t"    \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
                                                                         \
        "1:                                                     \n\t"    \
        "2:                                                     \n\t"    \
        MMI_ULDC1(%[ftmp3], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp4], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp5], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp6], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp7], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp8], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp9], %[tmp], 0x00)                                \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        MMI_ULDC1(%[ftmp10], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"    \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])         \
        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],            \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])         \
        "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"    \
        "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"    \
        "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"    \
        "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"    \
        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"    \
        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"    \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])           \
        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"    \
        "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"    \
        "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"    \
        "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"    \
        "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"    \
        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"    \
        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"    \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])           \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"    \
        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
        MMI_USDC1(%[ftmp3], %[dst], 0x00)                                \
                                                                         \
        "daddi        %[x],          %[x],         -0x01        \n\t"    \
        PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"    \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"    \
        "bnez         %[x],          2b                         \n\t"    \
                                                                         \
        "daddi        %[y],          %[y],         -0x01        \n\t"    \
        "li           %[x],        " #x_step "                  \n\t"    \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"    \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #dst_step "  \n\t"    \
        PTR_ADDIU    "%[dst],        %[dst],        0x80        \n\t"    \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
        "bnez         %[y],          1b                         \n\t"    \
        : RESTRICT_ASM_ALL64                                             \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),              \
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),              \
          [ftmp14]"=&f"(ftmp[14]), [rtmp0]"=&r"(rtmp[0]),                \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y),                 \
          [x]"+&r"(x)                                                    \
        : [filter]"r"(filter), [stride]"r"(srcstride)                    \
        : "memory"                                                       \
    );                                                                   \
}

PUT_HEVC_QPEL_HV(4, 1, -4, -8);
PUT_HEVC_QPEL_HV(8, 2, -8, -16);
PUT_HEVC_QPEL_HV(12, 3, -12, -24);
PUT_HEVC_QPEL_HV(16, 4, -16, -32);
PUT_HEVC_QPEL_HV(24, 6, -24, -48);
PUT_HEVC_QPEL_HV(32, 8, -32, -64);
PUT_HEVC_QPEL_HV(48, 12, -48, -96);
PUT_HEVC_QPEL_HV(64, 16, -64, -128);

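/*
 * Bi-predictive horizontal luma qpel interpolation: filter the 8-bit source
 * horizontally, add the co-located int16_t samples from src2 together with
 * the rounding offset, shift, clip and store 8-bit pixels to dst.
 */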
#define PUT_HEVC_QPEL_BI_H(w, x_step, src_step, src2_step, dst_step)    \
void ff_hevc_put_hevc_qpel_bi_h##w##_8_mmi(uint8_t *_dst,               \
                                           ptrdiff_t _dststride,        \
                                           uint8_t *_src,               \
                                           ptrdiff_t _srcstride,        \
                                           int16_t *src2, int height,   \
                                           intptr_t mx, intptr_t my,    \
                                           int width)                   \
{                                                                       \
    int x, y;                                                           \
    pixel        *src       = (pixel*)_src - 3;                         \
    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);               \
    pixel *dst          = (pixel *)_dst;                                \
    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];             \
    double ftmp[20];                                                    \
    uint64_t rtmp[1];                                                   \
    union av_intfloat64 shift;                                          \
    union av_intfloat64 offset;                                         \
    DECLARE_VAR_ALL64;                                                  \
    DECLARE_VAR_LOW32;                                                  \
    shift.i = 7;                                                        \
    offset.i = 64;                                                      \
                                                                        \
    x = width >> 2;                                                     \
    y = height;                                                         \
    __asm__ volatile(                                                   \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
        "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"   \
        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULDC1(%[ftmp3], %[src], 0x00)                               \
        MMI_ULDC1(%[ftmp4], %[src], 0x01)                               \
        MMI_ULDC1(%[ftmp5], %[src], 0x02)                               \
        MMI_ULDC1(%[ftmp6], %[src], 0x03)                               \
        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp3],      %[offset]   \n\t"   \
        MMI_ULDC1(%[ftmp4], %[src2], 0x00)                              \
        "li           %[rtmp0],      0x10                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
        "punpcklhw    %[ftmp5],      %[ftmp0],      %[ftmp3]    \n\t"   \
        "punpckhhw    %[ftmp6],      %[ftmp0],      %[ftmp3]    \n\t"   \
        "punpckhhw    %[ftmp3],      %[ftmp0],      %[ftmp4]    \n\t"   \
        "punpcklhw    %[ftmp4],      %[ftmp0],      %[ftmp4]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
        "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
        "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
        "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "pand         %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
        MMI_USWC1(%[ftmp3], %[dst], 0x00)                               \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
        PTR_ADDU     "%[src],        %[src],    %[src_stride]   \n\t"   \
        PTR_ADDU     "%[dst],        %[dst],    %[dst_stride]   \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_ALL64 RESTRICT_ASM_LOW32                         \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
          [ftmp12]"=&f"(ftmp[12]), [src2]"+&r"(src2),                   \
          [dst]"+&r"(dst), [src]"+&r"(src), [y]"+&r"(y), [x]"=&r"(x),   \
          [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0])                \
        : [src_stride]"r"(srcstride), [dst_stride]"r"(dststride),       \
          [filter]"r"(filter), [shift]"f"(shift.f)                      \
        : "memory"                                                      \
    );                                                                  \
}

PUT_HEVC_QPEL_BI_H(4, 1, -4, -8, -4);
PUT_HEVC_QPEL_BI_H(8, 2, -8, -16, -8);
PUT_HEVC_QPEL_BI_H(12, 3, -12, -24, -12);
PUT_HEVC_QPEL_BI_H(16, 4, -16, -32, -16);
PUT_HEVC_QPEL_BI_H(24, 6, -24, -48, -24);
PUT_HEVC_QPEL_BI_H(32, 8, -32, -64, -32);
PUT_HEVC_QPEL_BI_H(48, 12, -48, -96, -48);
PUT_HEVC_QPEL_BI_H(64, 16, -64, -128, -64);

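/*
 * Bi-predictive hv luma qpel interpolation: 8-tap horizontal pass into
 * tmp_array, 8-tap vertical pass, then combine with src2, round, shift
 * and clip to 8-bit pixels.
 */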
#define PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)   \
void ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(uint8_t *_dst,              \
                                            ptrdiff_t _dststride,       \
                                            uint8_t *_src,              \
                                            ptrdiff_t _srcstride,       \
                                            int16_t *src2, int height,  \
                                            intptr_t mx, intptr_t my,   \
                                            int width)                  \
{                                                                       \
    int x, y;                                                           \
    const int8_t *filter;                                               \
    pixel *src = (pixel*)_src;                                          \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
    pixel *dst          = (pixel *)_dst;                                \
    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];        \
    int16_t *tmp = tmp_array;                                           \
    double ftmp[20];                                                    \
    uint64_t rtmp[1];                                                   \
    union av_intfloat64 shift;                                          \
    union av_intfloat64 offset;                                         \
    DECLARE_VAR_ALL64;                                                  \
    DECLARE_VAR_LOW32;                                                  \
    shift.i = 7;                                                        \
    offset.i = 64;                                                      \
                                                                        \
    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                       \
    filter = ff_hevc_qpel_filters[mx - 1];                              \
    x = width >> 2;                                                     \
    y = height + QPEL_EXTRA;                                            \
    __asm__ volatile(                                                   \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULDC1(%[ftmp3], %[src], 0x00)                               \
        MMI_ULDC1(%[ftmp4], %[src], 0x01)                               \
        MMI_ULDC1(%[ftmp5], %[src], 0x02)                               \
        MMI_ULDC1(%[ftmp6], %[src], 0x03)                               \
        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
        MMI_USDC1(%[ftmp3], %[tmp], 0x00)                               \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_ALL64                                            \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),               \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
          [x]"+&r"(x)                                                   \
        : [filter]"r"(filter), [stride]"r"(srcstride)                   \
        : "memory"                                                      \
    );                                                                  \
                                                                        \
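    /* Second pass: 8-tap vertical filter, then combine with src2. */  \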
    tmp    = tmp_array;                                                 \
    filter = ff_hevc_qpel_filters[my - 1];                              \
    x = width >> 2;                                                     \
    y = height;                                                         \
    __asm__ volatile(                                                   \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
        "li           %[rtmp0],      0x06                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULDC1(%[ftmp3], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp4], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp5], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp6], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp7], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp8], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp9], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp10], %[tmp], 0x00)                              \
        PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"   \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],           \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
        "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"   \
        "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])          \
        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"   \
        "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"   \
        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])          \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
        MMI_ULDC1(%[ftmp4], %[src2], 0x00)                              \
        "pxor         %[ftmp7],      %[ftmp7],      %[ftmp7]    \n\t"   \
        "li           %[rtmp0],      0x10                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
        "punpcklhw    %[ftmp5],      %[ftmp7],      %[ftmp3]    \n\t"   \
        "punpckhhw    %[ftmp6],      %[ftmp7],      %[ftmp3]    \n\t"   \
        "punpckhhw    %[ftmp3],      %[ftmp7],      %[ftmp4]    \n\t"   \
        "punpcklhw    %[ftmp4],      %[ftmp7],      %[ftmp4]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
        "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
        "paddw        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"   \
        "paddw        %[ftmp6],      %[ftmp6],      %[offset]   \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
        "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
        "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp7]    \n\t"   \
        "pand         %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
        MMI_USWC1(%[ftmp3], %[dst], 0x00)                               \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
        PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_ALL64 RESTRICT_ASM_LOW32                         \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),             \
          [ftmp14]"=&f"(ftmp[14]), [src2]"+&r"(src2),                   \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
          [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0])                \
        : [filter]"r"(filter), [stride]"r"(dststride),                  \
          [shift]"f"(shift.f)                                           \
        : "memory"                                                      \
    );                                                                  \
}

PUT_HEVC_QPEL_BI_HV(4, 1, -4, -8, -4);
PUT_HEVC_QPEL_BI_HV(8, 2, -8, -16, -8);
PUT_HEVC_QPEL_BI_HV(12, 3, -12, -24, -12);
PUT_HEVC_QPEL_BI_HV(16, 4, -16, -32, -16);
PUT_HEVC_QPEL_BI_HV(24, 6, -24, -48, -24);
PUT_HEVC_QPEL_BI_HV(32, 8, -32, -64, -32);
PUT_HEVC_QPEL_BI_HV(48, 12, -48, -96, -48);
PUT_HEVC_QPEL_BI_HV(64, 16, -64, -128, -64);

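/*
 * Bi-predictive hv epel interpolation: 4-tap horizontal pass into
 * tmp_array, 4-tap vertical pass, then combine with src2, round, shift
 * and clip to 8-bit pixels.
 */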
#define PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)   \
void ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(uint8_t *_dst,              \
                                            ptrdiff_t _dststride,       \
                                            uint8_t *_src,              \
                                            ptrdiff_t _srcstride,       \
                                            int16_t *src2, int height,  \
                                            intptr_t mx, intptr_t my,   \
                                            int width)                  \
{                                                                       \
    int x, y;                                                           \
    pixel *src = (pixel *)_src;                                         \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
    pixel *dst          = (pixel *)_dst;                                \
    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
    const int8_t *filter = ff_hevc_epel_filters[mx - 1];                \
    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];        \
    int16_t *tmp = tmp_array;                                           \
    double  ftmp[12];                                                   \
    uint64_t rtmp[1];                                                   \
    union av_intfloat64 shift;                                          \
    union av_intfloat64 offset;                                         \
    DECLARE_VAR_ALL64;                                                  \
    DECLARE_VAR_LOW32;                                                  \
    shift.i = 7;                                                        \
    offset.i = 64;                                                      \
                                                                        \
    src -= (EPEL_EXTRA_BEFORE * srcstride + 1);                         \
    x = width >> 2;                                                     \
    y = height + EPEL_EXTRA;                                            \
    __asm__ volatile(                                                   \
        MMI_LWC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULWC1(%[ftmp2], %[src], 0x00)                               \
        MMI_ULWC1(%[ftmp3], %[src], 0x01)                               \
        MMI_ULWC1(%[ftmp4], %[src], 0x02)                               \
        MMI_ULWC1(%[ftmp5], %[src], 0x03)                               \
        "punpcklbh    %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp2],      %[ftmp2],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp3],      %[ftmp3],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp4],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp4],      %[ftmp4],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp5],      %[ftmp5],      %[ftmp1]    \n\t"   \
        TRANSPOSE_4H(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],            \
                     %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9])            \
        "paddh        %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"   \
        "paddh        %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"   \
        "paddh        %[ftmp2],      %[ftmp2],      %[ftmp4]    \n\t"   \
        MMI_USDC1(%[ftmp2], %[tmp], 0x00)                               \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_ALL64                                            \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [rtmp0]"=&r"(rtmp[0]),                                        \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
          [x]"+&r"(x)                                                   \
        : [filter]"r"(filter), [stride]"r"(srcstride)                   \
        : "memory"                                                      \
    );                                                                  \
                                                                        \
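    /* Second pass: 4-tap vertical filter, then combine with src2. */  \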
    tmp      = tmp_array;                                               \
    filter = ff_hevc_epel_filters[my - 1];                              \
    x = width >> 2;                                                     \
    y = height;                                                         \
    __asm__ volatile(                                                   \
        MMI_LWC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "li           %[rtmp0],      0x06                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
        "pxor         %[ftmp2],      %[ftmp2],      %[ftmp2]    \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULDC1(%[ftmp3], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp4], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp5], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp6], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],       -0x180       \n\t"   \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
        "pmaddhw      %[ftmp7],      %[ftmp3],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp8],      %[ftmp4],      %[ftmp1]    \n\t"   \
        TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp3], %[ftmp4])            \
        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmaddhw      %[ftmp7],      %[ftmp5],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp8],      %[ftmp6],      %[ftmp1]    \n\t"   \
        TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp5], %[ftmp6])            \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
        MMI_ULDC1(%[ftmp4], %[src2], 0x00)                              \
        "li           %[rtmp0],      0x10                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
        "punpcklhw    %[ftmp5],      %[ftmp2],      %[ftmp3]    \n\t"   \
        "punpckhhw    %[ftmp6],      %[ftmp2],      %[ftmp3]    \n\t"   \
        "punpckhhw    %[ftmp3],      %[ftmp2],      %[ftmp4]    \n\t"   \
        "punpcklhw    %[ftmp4],      %[ftmp2],      %[ftmp4]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
        "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
        "paddw        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"   \
        "paddw        %[ftmp6],      %[ftmp6],      %[offset]   \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
        "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
        "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp2]    \n\t"   \
        "pand         %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
        MMI_USWC1(%[ftmp3], %[dst], 0x0)                                \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
        PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_LOW32 RESTRICT_ASM_ALL64                         \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [ftmp10]"=&f"(ftmp[10]), [src2]"+&r"(src2),                   \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
          [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0])                \
        : [filter]"r"(filter), [stride]"r"(dststride),                  \
          [shift]"f"(shift.f)                                           \
        : "memory"                                                      \
    );                                                                  \
}

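/* Bi-prediction EPEL H+V (4-tap horizontal and vertical filters, 8-bit)
 * for the HEVC block widths 4..32. */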
PUT_HEVC_EPEL_BI_HV(4, 1, -4, -8, -4);
PUT_HEVC_EPEL_BI_HV(8, 2, -8, -16, -8);
PUT_HEVC_EPEL_BI_HV(12, 3, -12, -24, -12);
PUT_HEVC_EPEL_BI_HV(16, 4, -16, -32, -16);
PUT_HEVC_EPEL_BI_HV(24, 6, -24, -48, -24);
PUT_HEVC_EPEL_BI_HV(32, 8, -32, -64, -32);

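/* Bi-prediction full-pel copy: each source pixel is widened to 16 bit and
 * shifted left by 6 (14 - bit depth), the co-located 16-bit src2 prediction
 * and a rounding offset of 64 are added, and the sum is shifted right by 7
 * and clipped to 8 bit.  Eight output pixels are produced per inner-loop
 * iteration; the negative src_step/dst_step/src2_step arguments rewind the
 * pointers at the end of each row before the row strides are added. */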
#define PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step)  \
void ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(uint8_t *_dst,             \
                                               ptrdiff_t _dststride,      \
                                               uint8_t *_src,             \
                                               ptrdiff_t _srcstride,      \
                                               int16_t *src2, int height, \
                                               intptr_t mx, intptr_t my,  \
                                               int width)                 \
{                                                                         \
    int x, y;                                                             \
    pixel *src          = (pixel *)_src;                                  \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                     \
    pixel *dst          = (pixel *)_dst;                                  \
    ptrdiff_t dststride = _dststride / sizeof(pixel);                     \
    double  ftmp[12];                                                     \
    uint64_t rtmp[1];                                                     \
    union av_intfloat64 shift;                                            \
    DECLARE_VAR_ALL64;                                                    \
    shift.i = 7;                                                          \
                                                                          \
    y = height;                                                           \
    x = width >> 3;                                                       \
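    /* Per 8 pixels: ((src << 6) + 64 + src2) >> 7, clipped to [0, 255];  \
     * src2 is sign-extended to 32 bit (psraw by the 16 in ftmp10) before \
     * the addition. */                                                   \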
    __asm__ volatile(                                                     \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"     \
        "li           %[rtmp0],      0x06                       \n\t"     \
        "dmtc1        %[rtmp0],      %[ftmp1]                   \n\t"     \
        "li           %[rtmp0],      0x10                       \n\t"     \
        "dmtc1        %[rtmp0],      %[ftmp10]                  \n\t"     \
        "li           %[rtmp0],      0x40                       \n\t"     \
        "dmtc1        %[rtmp0],      %[offset]                  \n\t"     \
        "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"     \
        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"     \
                                                                          \
        "1:                                                     \n\t"     \
        "2:                                                     \n\t"     \
        MMI_ULDC1(%[ftmp5], %[src], 0x00)                                 \
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)                                \
        MMI_ULDC1(%[ftmp3], %[src2], 0x08)                                \
        "punpcklbh    %[ftmp4],      %[ftmp5],      %[ftmp0]    \n\t"     \
        "punpckhbh    %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"     \
        "psllh        %[ftmp4],      %[ftmp4],      %[ftmp1]    \n\t"     \
        "psllh        %[ftmp5],      %[ftmp5],      %[ftmp1]    \n\t"     \
        "paddh        %[ftmp4],      %[ftmp4],      %[offset]   \n\t"     \
        "paddh        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"     \
        "punpcklhw    %[ftmp6],      %[ftmp4],      %[ftmp0]    \n\t"     \
        "punpckhhw    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"     \
        "punpcklhw    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"     \
        "punpckhhw    %[ftmp9],      %[ftmp5],      %[ftmp0]    \n\t"     \
        "punpcklhw    %[ftmp4],      %[ftmp0],      %[ftmp3]    \n\t"     \
        "punpckhhw    %[ftmp5],      %[ftmp0],      %[ftmp3]    \n\t"     \
        "punpckhhw    %[ftmp3],      %[ftmp0],      %[ftmp2]    \n\t"     \
        "punpcklhw    %[ftmp2],      %[ftmp0],      %[ftmp2]    \n\t"     \
        "psraw        %[ftmp2],      %[ftmp2],      %[ftmp10]   \n\t"     \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp10]   \n\t"     \
        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp10]   \n\t"     \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp10]   \n\t"     \
        "paddw        %[ftmp2],      %[ftmp2],      %[ftmp6]    \n\t"     \
        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp7]    \n\t"     \
        "paddw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"     \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp9]    \n\t"     \
        "psraw        %[ftmp2],      %[ftmp2],      %[shift]    \n\t"     \
        "psraw        %[ftmp3],      %[ftmp3],      %[shift]    \n\t"     \
        "psraw        %[ftmp4],      %[ftmp4],      %[shift]    \n\t"     \
        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"     \
        "packsswh     %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"     \
        "packsswh     %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"     \
        "pcmpgth      %[ftmp3],      %[ftmp2],      %[ftmp0]    \n\t"     \
        "pcmpgth      %[ftmp5],      %[ftmp4],      %[ftmp0]    \n\t"     \
        "pand         %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"     \
        "pand         %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"     \
        "packushb     %[ftmp2],      %[ftmp2],      %[ftmp4]    \n\t"     \
        MMI_USDC1(%[ftmp2], %[dst], 0x0)                                  \
                                                                          \
        "daddi        %[x],          %[x],         -0x01        \n\t"     \
        PTR_ADDIU    "%[src],        %[src],        0x08        \n\t"     \
        PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"     \
        PTR_ADDIU    "%[src2],       %[src2],       0x10        \n\t"     \
        "bnez         %[x],          2b                         \n\t"     \
                                                                          \
        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"     \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"     \
        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"     \
        "li           %[x],        " #x_step "                  \n\t"     \
        "daddi        %[y],          %[y],         -0x01        \n\t"     \
        PTR_ADDU     "%[src],        %[src],       %[srcstride] \n\t"     \
        PTR_ADDU     "%[dst],        %[dst],       %[dststride] \n\t"     \
        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"     \
        "bnez         %[y],          1b                         \n\t"     \
        : RESTRICT_ASM_ALL64                                              \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                   \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                   \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                   \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                   \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                   \
          [ftmp10]"=&f"(ftmp[10]), [offset]"=&f"(ftmp[11]),               \
          [src2]"+&r"(src2), [dst]"+&r"(dst), [src]"+&r"(src),            \
          [x]"+&r"(x), [y]"+&r"(y), [rtmp0]"=&r"(rtmp[0])                 \
        : [dststride]"r"(dststride), [shift]"f"(shift.f),                 \
          [srcstride]"r"(srcstride)                                       \
        : "memory"                                                        \
    );                                                                    \
}

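/* Full-pel bi-prediction copy for block widths 8..64 (x_step = width / 8). */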
PUT_HEVC_PEL_BI_PIXELS(8, 1, -8, -8, -16);
PUT_HEVC_PEL_BI_PIXELS(16, 2, -16, -16, -32);
PUT_HEVC_PEL_BI_PIXELS(24, 3, -24, -24, -48);
PUT_HEVC_PEL_BI_PIXELS(32, 4, -32, -32, -64);
PUT_HEVC_PEL_BI_PIXELS(48, 6, -48, -48, -96);
PUT_HEVC_PEL_BI_PIXELS(64, 8, -64, -64, -128);

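/* Uni-prediction QPEL H+V: a first pass runs the 8-tap horizontal filter and
 * stores 16-bit intermediate rows (height + QPEL_EXTRA of them) into
 * tmp_array; a second pass runs the 8-tap vertical filter on that buffer,
 * adds the rounding offset of 32, shifts right by 6 and clips to 8 bit. */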
#define PUT_HEVC_QPEL_UNI_HV(w, x_step, src_step, dst_step, tmp_step)   \
void ff_hevc_put_hevc_qpel_uni_hv##w##_8_mmi(uint8_t *_dst,             \
                                             ptrdiff_t _dststride,      \
                                             uint8_t *_src,             \
                                             ptrdiff_t _srcstride,      \
                                             int height,                \
                                             intptr_t mx, intptr_t my,  \
                                             int width)                 \
{                                                                       \
    int x, y;                                                           \
    const int8_t *filter;                                               \
    pixel *src = (pixel*)_src;                                          \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
    pixel *dst          = (pixel *)_dst;                                \
    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];        \
    int16_t *tmp = tmp_array;                                           \
    double ftmp[20];                                                    \
    uint64_t rtmp[1];                                                   \
    union av_intfloat64 shift;                                          \
    union av_intfloat64 offset;                                         \
    DECLARE_VAR_ALL64;                                                  \
    DECLARE_VAR_LOW32;                                                  \
    shift.i = 6;                                                        \
    offset.i = 32;                                                      \
                                                                        \
    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                       \
    filter = ff_hevc_qpel_filters[mx - 1];                              \
    x = width >> 2;                                                     \
    y = height + QPEL_EXTRA;                                            \
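    /* First pass: 8-tap horizontal filter; each inner-loop iteration   \
     * writes four 16-bit results to the intermediate buffer, whose row \
     * stride is 0x80 bytes (MAX_PB_SIZE samples). */                   \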
    __asm__ volatile(                                                   \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
        "pxor         %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULDC1(%[ftmp3], %[src], 0x00)                               \
        MMI_ULDC1(%[ftmp4], %[src], 0x01)                               \
        MMI_ULDC1(%[ftmp5], %[src], 0x02)                               \
        MMI_ULDC1(%[ftmp6], %[src], 0x03)                               \
        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
        MMI_USDC1(%[ftmp3], %[tmp], 0x0)                                \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],      " #tmp_step " \n\t"   \
        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_ALL64                                            \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),               \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
          [x]"+&r"(x)                                                   \
        : [filter]"r"(filter), [stride]"r"(srcstride)                   \
        : "memory"                                                      \
    );                                                                  \
                                                                        \
    tmp    = tmp_array;                                                 \
    filter = ff_hevc_qpel_filters[my - 1];                              \
    x = width >> 2;                                                     \
    y = height;                                                         \
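    /* Second pass: 8-tap vertical filter over eight intermediate rows, \
     * followed by offset/shift/clip; four output pixels are written    \
     * per inner-loop iteration. */                                     \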
    __asm__ volatile(                                                   \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
        "li           %[rtmp0],      0x08                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
        "li           %[rtmp0],      0x06                       \n\t"   \
        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
        "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"   \
        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
                                                                        \
        "1:                                                     \n\t"   \
        "li           %[x],        " #x_step "                  \n\t"   \
        "2:                                                     \n\t"   \
        MMI_ULDC1(%[ftmp3], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp4], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp5], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp6], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp7], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp8], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp9], %[tmp], 0x00)                               \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        MMI_ULDC1(%[ftmp10], %[tmp], 0x00)                              \
        PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"   \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],           \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
        "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"   \
        "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"   \
        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])          \
        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
        "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"   \
        "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"   \
        "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"   \
        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])          \
        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
        "paddh        %[ftmp3],      %[ftmp3],      %[offset]   \n\t"   \
        "psrah        %[ftmp3],      %[ftmp3],      %[shift]    \n\t"   \
        "pxor         %[ftmp7],      %[ftmp7],      %[ftmp7]    \n\t"   \
        "pcmpgth      %[ftmp7],      %[ftmp3],      %[ftmp7]    \n\t"   \
        "pand         %[ftmp3],      %[ftmp3],      %[ftmp7]    \n\t"   \
        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
        MMI_USWC1(%[ftmp3], %[dst], 0x00)                               \
                                                                        \
        "daddi        %[x],          %[x],         -0x01        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
        "bnez         %[x],          2b                         \n\t"   \
                                                                        \
        "daddi        %[y],          %[y],         -0x01        \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],     " #tmp_step "  \n\t"   \
        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
        PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
        "bnez         %[y],          1b                         \n\t"   \
        : RESTRICT_ASM_ALL64 RESTRICT_ASM_LOW32                         \
          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),             \
          [ftmp14]"=&f"(ftmp[14]),                                      \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
          [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0])                \
        : [filter]"r"(filter), [stride]"r"(dststride),                  \
          [shift]"f"(shift.f)                                           \
        : "memory"                                                      \
    );                                                                  \
}

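/* Uni-prediction QPEL H+V for block widths 4..64 (x_step = width / 4). */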
PUT_HEVC_QPEL_UNI_HV(4, 1, -4, -4, -8);
PUT_HEVC_QPEL_UNI_HV(8, 2, -8, -8, -16);
PUT_HEVC_QPEL_UNI_HV(12, 3, -12, -12, -24);
PUT_HEVC_QPEL_UNI_HV(16, 4, -16, -16, -32);
PUT_HEVC_QPEL_UNI_HV(24, 6, -24, -24, -48);
PUT_HEVC_QPEL_UNI_HV(32, 8, -32, -32, -64);
PUT_HEVC_QPEL_UNI_HV(48, 12, -48, -48, -96);
PUT_HEVC_QPEL_UNI_HV(64, 16, -64, -64, -128);
