1/*
2 * Loongson SIMD utils
3 *
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#ifndef AVUTIL_MIPS_MMIUTILS_H
25#define AVUTIL_MIPS_MMIUTILS_H
26
27#include "config.h"
28
29#include "libavutil/mem_internal.h"
30#include "libavutil/mips/asmdefs.h"
31
32/*
 * Placeholder macros kept for source compatibility.
 *
 * These were used to declare temporary registers for the MMI macros and
 * to add them to the asm operand lists. The MMI macros in this header now
 * use $at as their scratch register instead, so every macro below expands
 * to nothing. They are left in place so that existing users of the names
 * keep compiling.
 */
37#define DECLARE_VAR_LOW32
38#define RESTRICT_ASM_LOW32
39#define DECLARE_VAR_ALL64
40#define RESTRICT_ASM_ALL64
41#define DECLARE_VAR_ADDRT
42#define RESTRICT_ASM_ADDRT
43
44#if HAVE_LOONGSON2
45
/**
 * Indexed word load into a GPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then "lw reg, bias($at)".
 * reg: destination GPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
46#define MMI_LWX(reg, addr, stride, bias)                                    \
47    ".set noat                                                 \n\t"   \
48    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
49    "lw         "#reg",     "#bias"($at)                       \n\t"   \
50    ".set at                                                   \n\t"
51
/**
 * Indexed word store from a GPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then "sw reg, bias($at)".
 * reg: source GPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
52#define MMI_SWX(reg, addr, stride, bias)                                    \
53    ".set noat                                                 \n\t"   \
54    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
55    "sw         "#reg",     "#bias"($at)                       \n\t"   \
56    ".set at                                                   \n\t"
57
/**
 * Indexed doubleword load into a GPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then "ld reg, bias($at)".
 * reg: destination GPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
58#define MMI_LDX(reg, addr, stride, bias)                                    \
59    ".set noat                                                 \n\t"   \
60    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
61    "ld         "#reg",     "#bias"($at)                       \n\t"   \
62    ".set at                                                   \n\t"
63
/**
 * Indexed doubleword store from a GPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then "sd reg, bias($at)".
 * reg: source GPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
64#define MMI_SDX(reg, addr, stride, bias)                                    \
65    ".set noat                                                 \n\t"   \
66    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
67    "sd         "#reg",     "#bias"($at)                       \n\t"   \
68    ".set at                                                   \n\t"
69
/**
 * Word load into an FPR (HAVE_LOONGSON2): emits "lwc1 fp, bias(addr)".
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 */
70#define MMI_LWC1(fp, addr, bias)                                            \
71    "lwc1       "#fp",      "#bias"("#addr")                        \n\t"
72
/**
 * Unaligned word load into an FPR (HAVE_LOONGSON2 fallback).
 * Loads bias(addr) into $at with the assembler's "ulw", then moves $at
 * into fp with "mtc1".
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
73#define MMI_ULWC1(fp, addr, bias)                                           \
74    ".set noat                                                      \n\t"   \
75    "ulw        $at,   "#bias"("#addr")                             \n\t"   \
76    "mtc1       $at,   "#fp"                                        \n\t"   \
77    ".set at                                                        \n\t"
78
/**
 * Indexed word load into an FPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then MMI_LWC1(fp, $at, bias).
 * fp: destination FPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
79#define MMI_LWXC1(fp, addr, stride, bias)                                   \
80    ".set noat                                                 \n\t"   \
81    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
82    MMI_LWC1(fp, $at, bias)                                            \
83    ".set at                                                   \n\t"
84
/**
 * Word store from an FPR (HAVE_LOONGSON2): emits "swc1 fp, bias(addr)".
 * fp: source FPR, addr: base GPR, bias: constant byte offset.
 */
85#define MMI_SWC1(fp, addr, bias)                                            \
86    "swc1       "#fp",      "#bias"("#addr")                        \n\t"
87
/**
 * Unaligned word store from an FPR (HAVE_LOONGSON2 fallback).
 * Moves fp into $at with "mfc1", then stores $at to bias(addr) with the
 * assembler's "usw".
 * fp: source FPR, addr: base GPR, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
88#define MMI_USWC1(fp, addr, bias)                                           \
89    ".set noat                                                      \n\t"   \
90    "mfc1       $at,   "#fp"                                        \n\t"   \
91    "usw        $at,   "#bias"("#addr")                             \n\t"   \
92    ".set at                                                        \n\t"
93
/**
 * Indexed word store from an FPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then MMI_SWC1(fp, $at, bias).
 * fp: source FPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
94#define MMI_SWXC1(fp, addr, stride, bias)                                   \
95    ".set noat                                                 \n\t"   \
96    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
97    MMI_SWC1(fp, $at, bias)                                           \
98    ".set at                                                   \n\t"
99
/**
 * Doubleword load into an FPR (HAVE_LOONGSON2): emits "ldc1 fp, bias(addr)".
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 */
100#define MMI_LDC1(fp, addr, bias)                                            \
101    "ldc1       "#fp",      "#bias"("#addr")                        \n\t"
102
/**
 * Unaligned doubleword load into an FPR (HAVE_LOONGSON2 fallback).
 * Loads bias(addr) into $at with the assembler's "uld", then moves $at
 * into fp with "dmtc1".
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
103#define MMI_ULDC1(fp, addr, bias)                                           \
104    ".set noat                                                      \n\t"   \
105    "uld        $at,   "#bias"("#addr")                             \n\t"   \
106    "dmtc1      $at,   "#fp"                                        \n\t"   \
107    ".set at                                                        \n\t"
108
/**
 * Indexed doubleword load into an FPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then MMI_LDC1(fp, $at, bias).
 * fp: destination FPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
109#define MMI_LDXC1(fp, addr, stride, bias)                                   \
110    ".set noat                                                 \n\t"   \
111    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
112    MMI_LDC1(fp, $at, bias)                                           \
113    ".set at                                                   \n\t"
114
/**
 * Doubleword store from an FPR (HAVE_LOONGSON2): emits "sdc1 fp, bias(addr)".
 * fp: source FPR, addr: base GPR, bias: constant byte offset.
 */
115#define MMI_SDC1(fp, addr, bias)                                            \
116    "sdc1       "#fp",      "#bias"("#addr")                        \n\t"
117
/**
 * Unaligned doubleword store from an FPR (HAVE_LOONGSON2 fallback).
 * Moves fp into $at with "dmfc1", then stores $at to bias(addr) with the
 * assembler's "usd".
 * fp: source FPR, addr: base GPR, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
118#define MMI_USDC1(fp, addr, bias)                                           \
119    ".set noat                                                      \n\t"   \
120    "dmfc1      $at,   "#fp"                                        \n\t"   \
121    "usd        $at,   "#bias"("#addr")                             \n\t"   \
122    ".set at                                                        \n\t"
123
/**
 * Indexed doubleword store from an FPR (HAVE_LOONGSON2 fallback).
 * Emits: $at = addr + stride (via PTR_ADDU), then MMI_SDC1(fp, $at, bias).
 * fp: source FPR, addr/stride: GPRs, bias: constant byte offset.
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
124#define MMI_SDXC1(fp, addr, stride, bias)                                   \
125    ".set noat                                                 \n\t"   \
126    PTR_ADDU    "$at,  "#addr",    "#stride"                   \n\t"   \
127    MMI_SDC1(fp, $at, bias)                                            \
128    ".set at                                                   \n\t"
129
/**
 * 16-byte load into two GPRs (HAVE_LOONGSON2 fallback), done as two "ld":
 * reg1 is loaded from bias(addr) and reg2 from 8+bias(addr).
 * bias is pasted into the expression "8+bias", so it must be a constant
 * the assembler can evaluate.
 */
130#define MMI_LQ(reg1, reg2, addr, bias)                                      \
131    "ld         "#reg1",    "#bias"("#addr")                        \n\t"   \
132    "ld         "#reg2",  8+"#bias"("#addr")                        \n\t"
133
/**
 * 16-byte store from two GPRs (HAVE_LOONGSON2 fallback), done as two "sd":
 * reg1 is stored to bias(addr) and reg2 to 8+bias(addr).
 * bias is pasted into the expression "8+bias", so it must be a constant
 * the assembler can evaluate.
 */
134#define MMI_SQ(reg1, reg2, addr, bias)                                      \
135    "sd         "#reg1",    "#bias"("#addr")                        \n\t"   \
136    "sd         "#reg2",  8+"#bias"("#addr")                        \n\t"
137
/**
 * 16-byte load into two FPRs (HAVE_LOONGSON2 fallback), done as two "ldc1":
 * fp1 is loaded from bias(addr) and fp2 from 8+bias(addr).
 * bias is pasted into the expression "8+bias", so it must be a constant
 * the assembler can evaluate.
 */
138#define MMI_LQC1(fp1, fp2, addr, bias)                                      \
139    "ldc1       "#fp1",     "#bias"("#addr")                        \n\t"   \
140    "ldc1       "#fp2",   8+"#bias"("#addr")                        \n\t"
141
/**
 * 16-byte store from two FPRs (HAVE_LOONGSON2 fallback), done as two "sdc1":
 * fp1 is stored to bias(addr) and fp2 to 8+bias(addr).
 * bias is pasted into the expression "8+bias", so it must be a constant
 * the assembler can evaluate.
 */
142#define MMI_SQC1(fp1, fp2, addr, bias)                                      \
143    "sdc1       "#fp1",     "#bias"("#addr")                        \n\t"   \
144    "sdc1       "#fp2",   8+"#bias"("#addr")                        \n\t"
145
146#elif HAVE_LOONGSON3 /* !HAVE_LOONGSON2 */
147
/**
 * Indexed word load into a GPR (HAVE_LOONGSON3): a single "gslwx"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback, which spells the access out as addr + stride
 * followed by "lw reg, bias($at)". This variant does not touch $at.
 */
148#define MMI_LWX(reg, addr, stride, bias)                                    \
149    "gslwx      "#reg",     "#bias"("#addr", "#stride")             \n\t"
150
/**
 * Indexed word store from a GPR (HAVE_LOONGSON3): a single "gsswx"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback, which spells the access out as addr + stride
 * followed by "sw reg, bias($at)". This variant does not touch $at.
 */
151#define MMI_SWX(reg, addr, stride, bias)                                    \
152    "gsswx      "#reg",     "#bias"("#addr", "#stride")             \n\t"
153
/**
 * Indexed doubleword load into a GPR (HAVE_LOONGSON3): a single "gsldx"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback, which spells the access out as addr + stride
 * followed by "ld reg, bias($at)". This variant does not touch $at.
 */
154#define MMI_LDX(reg, addr, stride, bias)                                    \
155    "gsldx      "#reg",     "#bias"("#addr", "#stride")             \n\t"
156
/**
 * Indexed doubleword store from a GPR (HAVE_LOONGSON3): a single "gssdx"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback, which spells the access out as addr + stride
 * followed by "sd reg, bias($at)". This variant does not touch $at.
 */
157#define MMI_SDX(reg, addr, stride, bias)                                    \
158    "gssdx      "#reg",     "#bias"("#addr", "#stride")             \n\t"
159
/**
 * Word load into an FPR (HAVE_LOONGSON3): emits "lwc1 fp, bias(addr)",
 * the same text as the HAVE_LOONGSON2 definition.
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 */
160#define MMI_LWC1(fp, addr, bias)                                            \
161    "lwc1       "#fp",      "#bias"("#addr")                        \n\t"
162
#if _MIPS_SIM == _ABIO32 /* workaround for 3A2000 gslwlc1 bug */

/**
 * Unaligned word load into an FPR built from plain lwl/lwr, so that the
 * gslwlc1/gslwrc1 pair is never emitted on O32.
 * fp:   destination FPR
 * addr: base GPR
 * bias: constant byte offset used by the lwr access
 * off:  constant added to bias for the lwl access
 * $at is clobbered; ".set noat"/".set at" bracket its explicit use.
 */
#define MMI_LWLRC1(fp, addr, bias, off)                                     \
    ".set noat                                                 \n\t"   \
    "lwl        $at,   "#bias"+"#off"("#addr")                 \n\t"   \
    "lwr        $at,   "#bias"("#addr")                        \n\t"   \
    "mtc1       $at,   "#fp"                                   \n\t"   \
    ".set at                                                   \n\t"

/**
 * Unaligned word load into an FPR.
 * Every other configuration in this header provides MMI_ULWC1, so it is
 * defined here as well, using the workaround sequence. The lwl offset of 3
 * mirrors the "3+bias" used by the gslwlc1 form in the other branch.
 * NOTE(review): +3 for the "left" access assumes a little-endian target,
 * as the gslwlc1 form already does - confirm no big-endian build uses this.
 */
#define MMI_ULWC1(fp, addr, bias)                                           \
    MMI_LWLRC1(fp, addr, bias, 3)

#else /* _MIPS_SIM != _ABIO32 */

/**
 * Same interface as the O32 MMI_LWLRC1, expressed with the Loongson
 * gslwlc1/gslwrc1 pair (off+bias for the "left" access, bias for the
 * "right" access). Provided so both ABI branches expose the same macros.
 */
#define MMI_LWLRC1(fp, addr, bias, off)                                     \
    "gslwlc1    "#fp",    "#off"+"#bias"("#addr")                   \n\t"   \
    "gslwrc1    "#fp",      "#bias"("#addr")                        \n\t"

/**
 * Unaligned word load into an FPR using gslwlc1 at 3+bias(addr) followed
 * by gslwrc1 at bias(addr).
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 */
#define MMI_ULWC1(fp, addr, bias)                                           \
    "gslwlc1    "#fp",    3+"#bias"("#addr")                        \n\t"   \
    "gslwrc1    "#fp",      "#bias"("#addr")                        \n\t"

#endif /* _MIPS_SIM != _ABIO32 */
182
/**
 * Indexed word load into an FPR (HAVE_LOONGSON3): a single "gslwxc1"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback (addr + stride, then lwc1 at bias).
 * This variant does not touch $at.
 */
183#define MMI_LWXC1(fp, addr, stride, bias)                                   \
184    "gslwxc1    "#fp",      "#bias"("#addr", "#stride")             \n\t"
185
/**
 * Word store from an FPR (HAVE_LOONGSON3): emits "swc1 fp, bias(addr)",
 * the same text as the HAVE_LOONGSON2 definition.
 * fp: source FPR, addr: base GPR, bias: constant byte offset.
 */
186#define MMI_SWC1(fp, addr, bias)                                            \
187    "swc1       "#fp",      "#bias"("#addr")                        \n\t"
188
/**
 * Unaligned word store from an FPR (HAVE_LOONGSON3): "gsswlc1" at
 * 3+bias(addr) followed by "gsswrc1" at bias(addr).
 * fp: source FPR, addr: base GPR, bias: constant byte offset (pasted into
 * the expression "3+bias").
 * NOTE(review): unlike the load counterpart, this store has no
 * _MIPS_SIM == _ABIO32 alternative in this header - confirm that is intended.
 */
189#define MMI_USWC1(fp, addr, bias)                                           \
190    "gsswlc1    "#fp",    3+"#bias"("#addr")                        \n\t"   \
191    "gsswrc1    "#fp",      "#bias"("#addr")                        \n\t"
192
/**
 * Indexed word store from an FPR (HAVE_LOONGSON3): a single "gsswxc1"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback (addr + stride, then swc1 at bias).
 * This variant does not touch $at.
 */
193#define MMI_SWXC1(fp, addr, stride, bias)                                   \
194    "gsswxc1    "#fp",      "#bias"("#addr", "#stride")             \n\t"
195
/**
 * Doubleword load into an FPR (HAVE_LOONGSON3): emits "ldc1 fp, bias(addr)",
 * the same text as the HAVE_LOONGSON2 definition.
 * fp: destination FPR, addr: base GPR, bias: constant byte offset.
 */
196#define MMI_LDC1(fp, addr, bias)                                            \
197    "ldc1       "#fp",      "#bias"("#addr")                        \n\t"
198
/**
 * Unaligned doubleword load into an FPR (HAVE_LOONGSON3): "gsldlc1" at
 * 7+bias(addr) followed by "gsldrc1" at bias(addr).
 * fp: destination FPR, addr: base GPR, bias: constant byte offset (pasted
 * into the expression "7+bias"). Does not touch $at.
 */
199#define MMI_ULDC1(fp, addr, bias)                                           \
200    "gsldlc1    "#fp",    7+"#bias"("#addr")                        \n\t"   \
201    "gsldrc1    "#fp",      "#bias"("#addr")                        \n\t"
202
/**
 * Indexed doubleword load into an FPR (HAVE_LOONGSON3): a single "gsldxc1"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback (addr + stride, then ldc1 at bias).
 * This variant does not touch $at.
 */
203#define MMI_LDXC1(fp, addr, stride, bias)                                   \
204    "gsldxc1    "#fp",      "#bias"("#addr", "#stride")             \n\t"
205
/**
 * Doubleword store from an FPR (HAVE_LOONGSON3): emits "sdc1 fp, bias(addr)",
 * the same text as the HAVE_LOONGSON2 definition.
 * fp: source FPR, addr: base GPR, bias: constant byte offset.
 */
206#define MMI_SDC1(fp, addr, bias)                                            \
207    "sdc1       "#fp",      "#bias"("#addr")                        \n\t"
208
/**
 * Unaligned doubleword store from an FPR (HAVE_LOONGSON3): "gssdlc1" at
 * 7+bias(addr) followed by "gssdrc1" at bias(addr).
 * fp: source FPR, addr: base GPR, bias: constant byte offset (pasted into
 * the expression "7+bias"). Does not touch $at.
 */
209#define MMI_USDC1(fp, addr, bias)                                           \
210    "gssdlc1    "#fp",    7+"#bias"("#addr")                        \n\t"   \
211    "gssdrc1    "#fp",      "#bias"("#addr")                        \n\t"
212
/**
 * Indexed doubleword store from an FPR (HAVE_LOONGSON3): a single "gssdxc1"
 * instruction with operand bias(addr, stride). Same parameters as the
 * HAVE_LOONGSON2 fallback (addr + stride, then sdc1 at bias).
 * This variant does not touch $at.
 */
213#define MMI_SDXC1(fp, addr, stride, bias)                                   \
214    "gssdxc1    "#fp",      "#bias"("#addr", "#stride")             \n\t"
215
/**
 * 16-byte load into two GPRs (HAVE_LOONGSON3): a single "gslq" instruction
 * taking reg1, reg2 and bias(addr).
 * NOTE(review): the HAVE_LOONGSON2 fallback puts reg1 at bias and reg2 at
 * 8+bias; which half each register receives here is decided by the gslq
 * instruction itself - confirm against the Loongson manual before relying
 * on the in-memory order.
 */
216#define MMI_LQ(reg1, reg2, addr, bias)                                      \
217    "gslq       "#reg1",    "#reg2",     "#bias"("#addr")           \n\t"
218
/**
 * 16-byte store from two GPRs (HAVE_LOONGSON3): a single "gssq" instruction
 * taking reg1, reg2 and bias(addr).
 * NOTE(review): the HAVE_LOONGSON2 fallback puts reg1 at bias and reg2 at
 * 8+bias; which half each register occupies here is decided by the gssq
 * instruction itself - confirm against the Loongson manual before relying
 * on the in-memory order.
 */
219#define MMI_SQ(reg1, reg2, addr, bias)                                      \
220    "gssq       "#reg1",    "#reg2",     "#bias"("#addr")           \n\t"
221
/**
 * 16-byte load into two FPRs (HAVE_LOONGSON3): a single "gslqc1"
 * instruction taking fp1, fp2 and bias(addr).
 * NOTE(review): the HAVE_LOONGSON2 fallback puts fp1 at bias and fp2 at
 * 8+bias; which half each register receives here is decided by the gslqc1
 * instruction itself - confirm against the Loongson manual before relying
 * on the in-memory order.
 */
222#define MMI_LQC1(fp1, fp2, addr, bias)                                      \
223    "gslqc1     "#fp1",     "#fp2",     "#bias"("#addr")            \n\t"
224
/**
 * 16-byte store from two FPRs (HAVE_LOONGSON3): a single "gssqc1"
 * instruction taking fp1, fp2 and bias(addr).
 * NOTE(review): the HAVE_LOONGSON2 fallback puts fp1 at bias and fp2 at
 * 8+bias; which half each register occupies here is decided by the gssqc1
 * instruction itself - confirm against the Loongson manual before relying
 * on the in-memory order.
 */
225#define MMI_SQC1(fp1, fp2, addr, bias)                                      \
226    "gssqc1     "#fp1",     "#fp2",     "#bias"("#addr")            \n\t"
227
228#endif /* HAVE_LOONGSON2 */
229
/**
 * Backup saved registers.
 *
 * We're not using the compiler's clobber list as it's not smart enough
 * to take advantage of quad word load/store.
 *
 * Declares temp_backup_reg (8 doubles = 64 bytes, via LOCAL_ALIGNED_16) and
 * stores FPR pairs into it with MMI_SQC1:
 *   - _MIPS_SIM == _ABI64: $f24..$f31 in four 16-byte slots (0x00..0x30);
 *   - otherwise:           $f20, $f22, ..., $f30 in three 16-byte slots
 *                          (0x00, 0x10, 0x20).
 * Each slot must be distinct: reusing an offset would overwrite the pair
 * stored just before it, and RECOVER_REG would then restore wrong values.
 *
 * The macro expands to a declaration followed by an if/else statement, so
 * it has to be placed where a declaration is allowed, in a scope that also
 * encloses the matching RECOVER_REG (which reads temp_backup_reg).
 */
#define BACKUP_REG \
  LOCAL_ALIGNED_16(double, temp_backup_reg, [8]);               \
  if (_MIPS_SIM == _ABI64)                                      \
    __asm__ volatile (                                          \
      MMI_SQC1($f25, $f24, %[temp], 0x00)                       \
      MMI_SQC1($f27, $f26, %[temp], 0x10)                       \
      MMI_SQC1($f29, $f28, %[temp], 0x20)                       \
      MMI_SQC1($f31, $f30, %[temp], 0x30)                       \
      :                                                         \
      : [temp]"r"(temp_backup_reg)                              \
      : "memory"                                                \
    );                                                          \
  else                                                          \
    __asm__ volatile (                                          \
      MMI_SQC1($f22, $f20, %[temp], 0x00)                       \
      MMI_SQC1($f26, $f24, %[temp], 0x10)                       \
      MMI_SQC1($f30, $f28, %[temp], 0x20)                       \
      :                                                         \
      : [temp]"r"(temp_backup_reg)                              \
      : "memory"                                                \
    );

/**
 * Recover registers saved by BACKUP_REG.
 *
 * Reloads every FPR pair with MMI_LQC1 from the same temp_backup_reg slot
 * that BACKUP_REG stored it to; the offsets in the two macros must stay in
 * lock-step. Requires a preceding BACKUP_REG in an enclosing scope.
 */
#define RECOVER_REG \
  if (_MIPS_SIM == _ABI64)                                      \
    __asm__ volatile (                                          \
      MMI_LQC1($f25, $f24, %[temp], 0x00)                       \
      MMI_LQC1($f27, $f26, %[temp], 0x10)                       \
      MMI_LQC1($f29, $f28, %[temp], 0x20)                       \
      MMI_LQC1($f31, $f30, %[temp], 0x30)                       \
      :                                                         \
      : [temp]"r"(temp_backup_reg)                              \
      : "memory"                                                \
    );                                                          \
  else                                                          \
    __asm__ volatile (                                          \
      MMI_LQC1($f22, $f20, %[temp], 0x00)                       \
      MMI_LQC1($f26, $f24, %[temp], 0x10)                       \
      MMI_LQC1($f30, $f28, %[temp], 0x20)                       \
      :                                                         \
      : [temp]"r"(temp_backup_reg)                              \
      : "memory"                                                \
    );
280
281/**
 * brief: Transpose 2X2 word packaged data.
 * fr_i0, fr_i1: src
 * fr_o0, fr_o1: dst
 *
 * fr_o0 = punpcklwd(fr_i0, fr_i1), fr_o1 = punpckhwd(fr_i0, fr_i1).
 * fr_o0 is written before the second instruction reads fr_i0/fr_i1, so
 * fr_o0 must not be the same register as either source.
 */
286#define TRANSPOSE_2W(fr_i0, fr_i1, fr_o0, fr_o1)                          \
287        "punpcklwd  "#fr_o0",   "#fr_i0",   "#fr_i1"                \n\t" \
288        "punpckhwd  "#fr_o1",   "#fr_i0",   "#fr_i1"                \n\t"
289
290/**
 * brief: Transpose 4X4 half word packaged data.
 * fr_i0, fr_i1, fr_i2, fr_i3: src & dst
 * fr_t0, fr_t1, fr_t2, fr_t3: temporary register
 *
 * Two stages: the inputs are first interleaved pairwise at halfword
 * granularity (punpck{l,h}hw) into the four temporaries, then the
 * temporaries are interleaved at word granularity (punpck{l,h}wd) back
 * into fr_i0..fr_i3. All four temporaries are clobbered and must be
 * distinct from the inputs.
 */
295#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3,                          \
296                     fr_t0, fr_t1, fr_t2, fr_t3)                          \
297        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
298        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
299        "punpcklhw  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
300        "punpckhhw  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
301        "punpcklwd  "#fr_i0",   "#fr_t0",   "#fr_t2"                \n\t" \
302        "punpckhwd  "#fr_i1",   "#fr_t0",   "#fr_t2"                \n\t" \
303        "punpcklwd  "#fr_i2",   "#fr_t1",   "#fr_t3"                \n\t" \
304        "punpckhwd  "#fr_i3",   "#fr_t1",   "#fr_t3"                \n\t"
305
306/**
 * brief: Transpose 8x8 byte packaged data.
 * fr_i0~i7: src & dst
 * fr_t0~t3: temporary register
 *
 * Three interleave stages, each reusing registers freed by the previous one:
 *   1. byte interleave (punpck{l,h}bh) of row pairs: rows 0-3 go to
 *      fr_t0..fr_t3, rows 4-7 go to fr_i0..fr_i3;
 *   2. halfword interleave (punpck{l,h}hw): the fr_t* results go to
 *      fr_i4..fr_i7, the fr_i0..fr_i3 results go to fr_t0..fr_t3;
 *   3. word interleave (punpck{l,h}wd) writes the final rows to
 *      fr_i0..fr_i7.
 * The statement order matters: each destination is overwritten only after
 * its last use as a source. fr_t0..fr_t3 are clobbered.
 */
311#define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5,            \
312                     fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3)            \
313        "punpcklbh  "#fr_t0",   "#fr_i0",   "#fr_i1"                \n\t" \
314        "punpckhbh  "#fr_t1",   "#fr_i0",   "#fr_i1"                \n\t" \
315        "punpcklbh  "#fr_t2",   "#fr_i2",   "#fr_i3"                \n\t" \
316        "punpckhbh  "#fr_t3",   "#fr_i2",   "#fr_i3"                \n\t" \
317        "punpcklbh  "#fr_i0",   "#fr_i4",   "#fr_i5"                \n\t" \
318        "punpckhbh  "#fr_i1",   "#fr_i4",   "#fr_i5"                \n\t" \
319        "punpcklbh  "#fr_i2",   "#fr_i6",   "#fr_i7"                \n\t" \
320        "punpckhbh  "#fr_i3",   "#fr_i6",   "#fr_i7"                \n\t" \
321        "punpcklhw  "#fr_i4",   "#fr_t0",   "#fr_t2"                \n\t" \
322        "punpckhhw  "#fr_i5",   "#fr_t0",   "#fr_t2"                \n\t" \
323        "punpcklhw  "#fr_i6",   "#fr_t1",   "#fr_t3"                \n\t" \
324        "punpckhhw  "#fr_i7",   "#fr_t1",   "#fr_t3"                \n\t" \
325        "punpcklhw  "#fr_t0",   "#fr_i0",   "#fr_i2"                \n\t" \
326        "punpckhhw  "#fr_t1",   "#fr_i0",   "#fr_i2"                \n\t" \
327        "punpcklhw  "#fr_t2",   "#fr_i1",   "#fr_i3"                \n\t" \
328        "punpckhhw  "#fr_t3",   "#fr_i1",   "#fr_i3"                \n\t" \
329        "punpcklwd  "#fr_i0",   "#fr_i4",   "#fr_t0"                \n\t" \
330        "punpckhwd  "#fr_i1",   "#fr_i4",   "#fr_t0"                \n\t" \
331        "punpcklwd  "#fr_i2",   "#fr_i5",   "#fr_t1"                \n\t" \
332        "punpckhwd  "#fr_i3",   "#fr_i5",   "#fr_t1"                \n\t" \
333        "punpcklwd  "#fr_i4",   "#fr_i6",   "#fr_t2"                \n\t" \
334        "punpckhwd  "#fr_i5",   "#fr_i6",   "#fr_t2"                \n\t" \
335        "punpcklwd  "#fr_i6",   "#fr_i7",   "#fr_t3"                \n\t" \
336        "punpckhwd  "#fr_i7",   "#fr_i7",   "#fr_t3"                \n\t"
337
338/**
 * brief: Parallel SRA for 8 byte packaged data.
 * fr_i0: src
 * fr_i1: SRA number(SRAB number + 8)
 * fr_t0, fr_t1: temporary register
 * fr_d0: dst
 *
 * The source bytes are widened to halfwords by interleaving them with the
 * current contents of fr_t0/fr_t1 (punpck{l,h}bh), shifted per halfword
 * with psrah, and packed back to bytes with packsshb.
 * The temporaries are read before being written, so their previous
 * contents end up in the widened halfwords; presumably the extra 8 in
 * fr_i1 is what shifts those stale bits out - TODO confirm.
 * fr_t0 and fr_t1 are clobbered.
 */
345#define PSRAB_MMI(fr_i0, fr_i1, fr_t0, fr_t1, fr_d0)                      \
346        "punpcklbh    "#fr_t0",   "#fr_t0",   "#fr_i0"              \n\t" \
347        "punpckhbh    "#fr_t1",   "#fr_t1",   "#fr_i0"              \n\t" \
348        "psrah        "#fr_t0",   "#fr_t0",   "#fr_i1"              \n\t" \
349        "psrah        "#fr_t1",   "#fr_t1",   "#fr_i1"              \n\t" \
350        "packsshb     "#fr_d0",   "#fr_t0",   "#fr_t1"              \n\t"
351
352/**
 * brief: Parallel SRL for 8 byte packaged data.
 * fr_i0: src
 * fr_i1: SRL number(SRLB number + 8)
 * fr_t0, fr_t1: temporary register
 * fr_d0: dst
 *
 * Same structure as PSRAB_MMI, with psrlh as the per-halfword shift: the
 * source bytes are widened by interleaving them with the current contents
 * of fr_t0/fr_t1, shifted, and packed back to bytes.
 * fr_t0 and fr_t1 are clobbered.
 * NOTE(review): the final pack is packsshb, the same instruction PSRAB_MMI
 * uses. If that is a signed-saturating pack, a logical shift result above
 * 127 (only possible with an SRLB number of 0) would be clamped - confirm
 * that callers never need that case, or that packushb is not required.
 */
359#define PSRLB_MMI(fr_i0, fr_i1, fr_t0, fr_t1, fr_d0)                      \
360        "punpcklbh    "#fr_t0",   "#fr_t0",   "#fr_i0"              \n\t" \
361        "punpckhbh    "#fr_t1",   "#fr_t1",   "#fr_i0"              \n\t" \
362        "psrlh        "#fr_t0",   "#fr_t0",   "#fr_i1"              \n\t" \
363        "psrlh        "#fr_t1",   "#fr_t1",   "#fr_i1"              \n\t" \
364        "packsshb     "#fr_d0",   "#fr_t0",   "#fr_t1"              \n\t"
365
/**
 * brief: Apply psrah in place to four registers.
 * fp1, fp2, fp3, fp4: src & dst
 * shift: register operand holding the shift count, shared by all four
 */
366#define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
367        "psrah      "#fp1",     "#fp1",     "#shift"                \n\t" \
368        "psrah      "#fp2",     "#fp2",     "#shift"                \n\t" \
369        "psrah      "#fp3",     "#fp3",     "#shift"                \n\t" \
370        "psrah      "#fp4",     "#fp4",     "#shift"                \n\t"
371
/**
 * brief: Apply psrah in place to eight registers.
 * fp1~fp8: src & dst
 * shift: register operand holding the shift count, shared by all eight
 * Implemented as two PSRAH_4_MMI expansions (fp1-fp4, then fp5-fp8).
 */
372#define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift)        \
373        PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift)                            \
374        PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)
375
376/**
 * brief: (((value) + (1 << ((n) - 1))) >> (n))
 * fr_i0: src & dst
 * fr_i1: Operand number
 * fr_t0, fr_t1: temporary FPR
 * gr_t0: temporary GPR
 *
 * Step by step:
 *   gr_t0 = 1; fr_t0 = gr_t0, then punpcklwd copies its low word into
 *   both word lanes;
 *   fr_t1 = fr_i1 - fr_t0            (n - 1, psubw)
 *   fr_t1 = fr_t0 << fr_t1           (1 << (n - 1), psllw)
 *   fr_i0 = (fr_i0 + fr_t1) >> fr_i1 (paddw, then arithmetic psraw)
 * All arithmetic uses the packed-word (…w) instructions.
 * gr_t0, fr_t0 and fr_t1 are clobbered.
 * NOTE(review): with n == 0 the rounding term would use a shift count of
 * n - 1; callers presumably always pass n >= 1 - confirm.
 */
383#define ROUND_POWER_OF_TWO_MMI(fr_i0, fr_i1, fr_t0, fr_t1, gr_t0)         \
384        "li         "#gr_t0",     0x01                              \n\t" \
385        "dmtc1      "#gr_t0",     "#fr_t0"                          \n\t" \
386        "punpcklwd  "#fr_t0",     "#fr_t0",    "#fr_t0"             \n\t" \
387        "psubw      "#fr_t1",     "#fr_i1",    "#fr_t0"             \n\t" \
388        "psllw      "#fr_t1",     "#fr_t0",    "#fr_t1"             \n\t" \
389        "paddw      "#fr_i0",     "#fr_i0",    "#fr_t1"             \n\t" \
390        "psraw      "#fr_i0",     "#fr_i0",    "#fr_i1"             \n\t"
391
392#endif /* AVUTIL_MIPS_MMIUTILS_H */
393