1/*
2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
23
24#include <stdint.h>
25#include <msa.h>
26#include <config.h>
27
/* Default alignment value, in bytes, used with ALLOC_ALIGNED(). */
#define ALIGNMENT           16
/* Variable attribute requesting an alignment of twice 'align' bytes
   (so ALLOC_ALIGNED(ALIGNMENT) asks for 32-byte alignment). */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned(2 * (align))))
30
/* Load one vector of type RTYPE from 'psrc'.
   Expands to a dereference of 'psrc' cast to (RTYPE *); nothing else is
   added. The whole expansion is parenthesized so that it can be combined
   with postfix operators (for example subscripting the loaded vector)
   without the operator binding to the cast pointer instead. */
#define LD_V(RTYPE, psrc) (*((RTYPE *)(psrc)))
/* Type-specific wrappers: the suffix selects the vector type that is
   passed as RTYPE. */
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
38
/* Store vector 'in' as type RTYPE to 'pdst'.
   Expands to an assignment through 'pdst' cast to (RTYPE *). The whole
   assignment is parenthesized so the macro behaves as a single expression
   when a caller applies an operator or cast to it. */
#define ST_V(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
/* Type-specific wrappers: the suffix selects the vector type that is
   passed as RTYPE. */
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
46
47#if (__mips_isa_rev >= 6)
48    #define LH(psrc)                              \
49    ( {                                           \
50        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
51        val_lh_m;                                 \
52    } )
53
54    #define LW(psrc)                              \
55    ( {                                           \
56        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
57        val_lw_m;                                 \
58    } )
59
60    #if (__mips == 64)
61        #define LD(psrc)                               \
62        ( {                                            \
63            uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
64            val_ld_m;                                  \
65        } )
66    #else  // !(__mips == 64)
67        #define LD(psrc)                                                    \
68        ( {                                                                 \
69            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
70            uint32_t val0_ld_m, val1_ld_m;                                  \
71            uint64_t val_ld_m = 0;                                          \
72                                                                            \
73            val0_ld_m = LW(psrc_ld_m);                                      \
74            val1_ld_m = LW(psrc_ld_m + 4);                                  \
75                                                                            \
76            val_ld_m = (uint64_t) (val1_ld_m);                              \
77            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
78            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
79                                                                            \
80            val_ld_m;                                                       \
81        } )
82    #endif  // (__mips == 64)
83
84    #define SH(val, pdst)  *(uint16_t *)(pdst) = (val);
85    #define SW(val, pdst)  *(uint32_t *)(pdst) = (val);
86    #define SD(val, pdst)  *(uint64_t *)(pdst) = (val);
87
88#else  // !(__mips_isa_rev >= 6)
    /* Load a 16-bit value from 'psrc' through inline assembly.
       'psrc' is viewed as a byte pointer and handed to the asm as a memory
       operand; the "ulh" instruction places the result in a general
       register, and the macro evaluates to that uint16_t.
       NOTE(review): "ulh" is presumably the assembler's unaligned
       load-halfword macro-instruction, which would make this usable on
       unaligned addresses - confirm against the assembler documentation.
       NOTE(review): the "m" operand is typed as a single uint8_t although
       the load covers two bytes - confirm that this is sufficient for the
       compiler's dependency tracking. */
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )
103
104    #define LW(psrc)                                 \
105    ( {                                              \
106        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
107        uint32_t val_lw_m;                           \
108                                                     \
109        __asm__ volatile (                           \
110            "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"  \
111            "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"  \
112                                                     \
113            : [val_lw_m] "=&r"(val_lw_m)             \
114            : [psrc_lw_m] "r"(psrc_lw_m)             \
115        );                                           \
116                                                     \
117        val_lw_m;                                    \
118    } )
119
120    #if (__mips == 64)
121        #define LD(psrc)                                 \
122        ( {                                              \
123            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
124            uint64_t val_ld_m = 0;                       \
125                                                         \
126            __asm__ volatile (                           \
127                "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"  \
128                "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"  \
129                                                         \
130                : [val_ld_m] "=&r" (val_ld_m)            \
131                : [psrc_ld_m] "r" (psrc_ld_m)            \
132            );                                           \
133                                                         \
134            val_ld_m;                                    \
135        } )
136    #else  // !(__mips == 64)
137        #define LD(psrc)                                                    \
138        ( {                                                                 \
139            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
140            uint32_t val0_ld_m, val1_ld_m;                                  \
141            uint64_t val_ld_m = 0;                                          \
142                                                                            \
143            val0_ld_m = LW(psrc_ld_m);                                      \
144            val1_ld_m = LW(psrc_ld_m + 4);                                  \
145                                                                            \
146            val_ld_m = (uint64_t) (val1_ld_m);                              \
147            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
148            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
149                                                                            \
150            val_ld_m;                                                       \
151        } )
152    #endif  // (__mips == 64)
153
    /* Store the 16-bit value 'val' to 'pdst' through inline assembly.
       'val' is first converted to uint16_t and passed in a register;
       'pdst' is viewed as a byte pointer and handed to the "ush"
       instruction as the output memory operand.
       Expands to a brace-enclosed block, so it is usable as a statement
       only.
       NOTE(review): "ush" is presumably the assembler's unaligned
       store-halfword macro-instruction - confirm against the assembler
       documentation.
       NOTE(review): the "=m" operand is typed as a single uint8_t although
       the store covers two bytes - confirm that this is sufficient for the
       compiler's dependency tracking. */
    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }
166
    /* Store the 32-bit value 'val' to 'pdst' through inline assembly.
       'val' is first converted to uint32_t and passed in a register;
       'pdst' is viewed as a byte pointer and handed to the "usw"
       instruction as the output memory operand.
       Expands to a brace-enclosed block, so it is usable as a statement
       only.
       NOTE(review): "usw" is presumably the assembler's unaligned
       store-word macro-instruction - confirm against the assembler
       documentation.
       NOTE(review): the "=m" operand is typed as a single uint8_t although
       the store covers four bytes - confirm that this is sufficient for
       the compiler's dependency tracking. */
    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }
179
180    #define SD(val, pdst)                                             \
181    {                                                                 \
182        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
183        uint32_t val0_sd_m, val1_sd_m;                                \
184                                                                      \
185        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
186        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
187                                                                      \
188        SW(val0_sd_m, pdst_sd_m);                                     \
189        SW(val1_sd_m, pdst_sd_m + 4);                                 \
190    }
191#endif // (__mips_isa_rev >= 6)
192
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic,
                                    i.e. in units of the type 'psrc'
                                    points to)
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
                 Expands to a brace-enclosed block (statement use only).
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + (stride));                  \
    out2 = LW((psrc) + 2 * (stride));              \
    out3 = LW((psrc) + 3 * (stride));              \
}

/* Two-word variant of LW4: loads 'out0' from (psrc) and 'out1' from
   (psrc + stride). */
#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + (stride));      \
}
215
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic,
                                    i.e. in units of the type 'psrc'
                                    points to)
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
                 LD4 continues with 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
                 Expands to a brace-enclosed block (statement use only).
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + (stride));      \
}
#define LD4(psrc, stride, out0, out1, out2, out3)      \
{                                                      \
    LD2((psrc), (stride), out0, out1);                 \
    LD2((psrc) + 2 * (stride), (stride), out2, out3);  \
}
233
/* Description : Store 4 words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
                           ('stride' is added to 'pdst' by pointer
                            arithmetic, i.e. in units of the type 'pdst'
                            points to)
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
                 Expands to a brace-enclosed block (statement use only).
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + (stride));                \
    SW(in2, (pdst) + 2 * (stride));            \
    SW(in3, (pdst) + 3 * (stride));            \
}
248
/* Description : Store 4 double words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
                           ('stride' is added to 'pdst' by pointer
                            arithmetic, i.e. in units of the type 'pdst'
                            points to)
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
                 Expands to a brace-enclosed block (statement use only).
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + (stride));                \
    SD(in2, (pdst) + 2 * (stride));            \
    SD(in3, (pdst) + 3 * (stride));            \
}
263
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic,
                                    i.e. in units of the type 'psrc'
                                    points to)
                 Outputs - out0, out1, ... (one per loaded vector)
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
                 LD_Vn continues the same way up to
                 'out<n-1>' from (psrc + (n - 1) * stride)
                 'stride' is parenthesized in every expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
                 Each macro expands to a brace-enclosed block
                 (statement use only).
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + (stride));      \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), (stride), out0, out1);       \
    out2 = LD_V(RTYPE, (psrc) + 2 * (stride));        \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
{                                                              \
    LD_V2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_V2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);       \
    out4 = LD_V(RTYPE, (psrc) + 4 * (stride));                    \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);             \
    LD_V2(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5);          \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                                 \
              out0, out1, out2, out3, out4, out5, out6)            \
{                                                                  \
    LD_V5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6);     \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                          \
              out0, out1, out2, out3, out4, out5, out6, out7)               \
{                                                                           \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                 \
    LD_V4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), (stride),                                    \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * (stride), (stride),                     \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
351
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, ... (vectors to store)
                         - pdst    (destination pointer to store to)
                         - stride  (added to 'pdst' by pointer arithmetic,
                                    i.e. in units of the type 'pdst'
                                    points to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
                 ST_Vn continues the same way up to
                 'in<n-1>' to (pdst + (n - 1) * stride)
                 'stride' is parenthesized in every expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
                 Each macro expands to a brace-enclosed block
                 (statement use only).
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + (stride));      \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)        \
{                                                             \
    ST_V2(RTYPE, in0, in1, (pdst), (stride));                 \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride));           \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * (stride), (stride));      \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                     \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));      \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
394
/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs  - in      (source vector)
 *                       - idx, idx0 .. idx7 (element indices passed to
 *                                            __msa_copy_u_h)
 *                       - pdst    (destination pointer to store to)
 *                       - stride  (added to 'pdst' by pointer arithmetic,
 *                                  i.e. in units of the type 'pdst'
 *                                  points to)
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 *               'in' and 'stride' are parenthesized in the expansion, so
 *               expression arguments are cast / scaled as a whole.
 *               Each macro expands to a brace-enclosed block
 *               (statement use only).
 */
#define ST_H1(in, idx, pdst)                             \
{                                                        \
    uint16_t out0_m;                                     \
    out0_m = __msa_copy_u_h((v8i16) (in), idx);          \
    SH(out0_m, (pdst));                                  \
}
#define ST_H2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint16_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_h((v8i16) (in), idx0);         \
    out1_m = __msa_copy_u_h((v8i16) (in), idx1);         \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + (stride));                       \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) (in), idx0);         \
    out1_m = __msa_copy_u_h((v8i16) (in), idx1);         \
    out2_m = __msa_copy_u_h((v8i16) (in), idx2);         \
    out3_m = __msa_copy_u_h((v8i16) (in), idx3);         \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + (stride));                       \
    SH(out2_m, (pdst) + 2 * (stride));                   \
    SH(out3_m, (pdst) + 3 * (stride));                   \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,                    \
              idx6, idx7, pdst, stride)                                  \
{                                                                        \
    ST_H4(in, idx0, idx1, idx2, idx3, (pdst), (stride));                 \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4 * (stride), (stride));  \
}
435
/* Description : Store word elements of vector with stride
 * Arguments   : Inputs  - in      (source vector; ST_W8 takes two source
 *                                  vectors, 'in0' and 'in1')
 *                       - idx, idx0 .. idx7 (element indices passed to
 *                                            __msa_copy_u_w)
 *                       - pdst    (destination pointer to store to)
 *                       - stride  (added to 'pdst' by pointer arithmetic,
 *                                  i.e. in units of the type 'pdst'
 *                                  points to)
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 *               ST_W8 stores idx0..idx3 of 'in0' to the first four
 *               positions and idx4..idx7 of 'in1' to the next four
 *               'in', 'pdst' and 'stride' are parenthesized in the
 *               expansion, so expression arguments are cast / scaled as a
 *               whole.
 *               Each macro expands to a brace-enclosed block
 *               (statement use only).
 */
#define ST_W1(in, idx, pdst)                             \
{                                                        \
    uint32_t out0_m;                                     \
    out0_m = __msa_copy_u_w((v4i32) (in), idx);          \
    SW(out0_m, (pdst));                                  \
}
#define ST_W2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint32_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_w((v4i32) (in), idx0);         \
    out1_m = __msa_copy_u_w((v4i32) (in), idx1);         \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + (stride));                       \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_w((v4i32) (in), idx0);         \
    out1_m = __msa_copy_u_w((v4i32) (in), idx1);         \
    out2_m = __msa_copy_u_w((v4i32) (in), idx2);         \
    out3_m = __msa_copy_u_w((v4i32) (in), idx3);         \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + (stride));                       \
    SW(out2_m, (pdst) + 2 * (stride));                   \
    SW(out3_m, (pdst) + 3 * (stride));                   \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                           \
              idx4, idx5, idx6, idx7, pdst, stride)                       \
{                                                                         \
    ST_W4(in0, idx0, idx1, idx2, idx3, (pdst), (stride));                 \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4 * (stride), (stride));  \
}
476
/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs  - in      (source vector; ST_D4 takes 'in0' and
 *                                  'in1', ST_D8 takes 'in0' .. 'in3')
 *                       - idx, idx0 .. idx7 (element indices passed to
 *                                            __msa_copy_u_d)
 *                       - pdst    (destination pointer to store to)
 *                       - stride  (added to 'pdst' by pointer arithmetic,
 *                                  i.e. in units of the type 'pdst'
 *                                  points to)
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 *               ST_D4 takes idx0, idx1 from 'in0' and idx2, idx3 from
 *               'in1'; ST_D8 applies ST_D4 to (in0, in1) and then to
 *               (in2, in3) four positions further on
 *               'in*', 'pdst' and 'stride' are parenthesized in the
 *               expansion, so expression arguments are cast / scaled as a
 *               whole.
 *               Each macro expands to a brace-enclosed block
 *               (statement use only).
 */
#define ST_D1(in, idx, pdst)                    \
{                                               \
    uint64_t out0_m;                            \
    out0_m = __msa_copy_u_d((v2i64) (in), idx); \
    SD(out0_m, (pdst));                         \
}
#define ST_D2(in, idx0, idx1, pdst, stride)      \
{                                                \
    uint64_t out0_m, out1_m;                     \
    out0_m = __msa_copy_u_d((v2i64) (in), idx0); \
    out1_m = __msa_copy_u_d((v2i64) (in), idx1); \
    SD(out0_m, (pdst));                          \
    SD(out1_m, (pdst) + (stride));               \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    out0_m = __msa_copy_u_d((v2i64) (in0), idx0);             \
    out1_m = __msa_copy_u_d((v2i64) (in0), idx1);             \
    out2_m = __msa_copy_u_d((v2i64) (in1), idx2);             \
    out3_m = __msa_copy_u_d((v2i64) (in1), idx3);             \
    SD(out0_m, (pdst));                                       \
    SD(out1_m, (pdst) + (stride));                            \
    SD(out2_m, (pdst) + 2 * (stride));                        \
    SD(out3_m, (pdst) + 3 * (stride));                        \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,                      \
              idx4, idx5, idx6, idx7, pdst, stride)                            \
{                                                                              \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, (pdst), (stride));                 \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * (stride), (stride));  \
}
517
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                           ('stride' is the distance in bytes between two
                            consecutive rows: it is added to a uint8_t
                            pointer)
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m) followed by
                 index 2 word element from same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar to remaining lines, with 'pblk_12x8_m' advanced by
                 'stride' after each row
                 All elements are extracted from the vectors before the
                 first store is issued.
                 The vector arguments are parenthesized before being cast,
                 so expression arguments are cast as a whole.
                 Expands to a brace-enclosed block (statement use only).
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) (in0), 0);                           \
    out1_m = __msa_copy_u_d((v2i64) (in1), 0);                           \
    out2_m = __msa_copy_u_d((v2i64) (in2), 0);                           \
    out3_m = __msa_copy_u_d((v2i64) (in3), 0);                           \
    out4_m = __msa_copy_u_d((v2i64) (in4), 0);                           \
    out5_m = __msa_copy_u_d((v2i64) (in5), 0);                           \
    out6_m = __msa_copy_u_d((v2i64) (in6), 0);                           \
    out7_m = __msa_copy_u_d((v2i64) (in7), 0);                           \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) (in0), 2);                          \
    out9_m =  __msa_copy_u_w((v4i32) (in1), 2);                          \
    out10_m = __msa_copy_u_w((v4i32) (in2), 2);                          \
    out11_m = __msa_copy_u_w((v4i32) (in3), 2);                          \
    out12_m = __msa_copy_u_w((v4i32) (in4), 2);                          \
    out13_m = __msa_copy_u_w((v4i32) (in5), 2);                          \
    out14_m = __msa_copy_u_w((v4i32) (in6), 2);                          \
    out15_m = __msa_copy_u_w((v4i32) (in7), 2);                          \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += (stride);                                             \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += (stride);                                             \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
577
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3 (AVER_UB2)
                           in0 .. in7         (AVER_UB4)
                 Outputs - out0, out1 (AVER_UB2); out0 .. out3 (AVER_UB4)
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' vector is added with each byte
                 element from 'in1' vector. The addition of the elements plus 1
                 (for rounding) is done unsigned with full precision,
                 i.e. the result has one extra bit. Unsigned division by 2
                 (or logical shift right by one bit) is performed before writing
                 the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
                 (and, in AVER_UB4, for 'in4'/'in5' and 'in6'/'in7')
                 The inputs are parenthesized before being cast to v16u8, so
                 expression arguments are cast as a whole.
                 Each macro expands to a brace-enclosed block
                 (statement use only).
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) (in0), (v16u8) (in1));  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) (in2), (v16u8) (in3));  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
604
/* Description : Slide byte elements by an immediate number of positions
   Arguments   : Inputs  - d, s, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : 'd', 's' and 'slide_val' are passed to __msa_sldi_b in that
                 order, both vectors being viewed as v16i8, so that byte
                 elements from 'd' are slid into 's' by the number of
                 elements given in 'slide_val' (an immediate).
*/
#define SLDI_B(RTYPE, d, s, slide_val, out)                    \
{                                                              \
    out = (RTYPE)__msa_sldi_b((v16i8)d, (v16i8)s, slide_val);  \
}
616
/* Two-pair byte slide: the same immediate 'slide_val' is applied first to
   (d0, s0) giving 'out0' and then to (d1, s1) giving 'out1'. */
#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)     \
{                                                                 \
    out0 = (RTYPE)__msa_sldi_b((v16i8)d0, (v16i8)s0, slide_val);  \
    out1 = (RTYPE)__msa_sldi_b((v16i8)d1, (v16i8)s1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)
626
/* Three-pair byte slide: (d0, s0) -> out0, (d1, s1) -> out1 and
   (d2, s2) -> out2, all with the same immediate 'slide_val'. */
#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,         \
                out0, out1, out2)                                 \
{                                                                 \
    out0 = (RTYPE)__msa_sldi_b((v16i8)d0, (v16i8)s0, slide_val);  \
    out1 = (RTYPE)__msa_sldi_b((v16i8)d1, (v16i8)s1, slide_val);  \
    out2 = (RTYPE)__msa_sldi_b((v16i8)d2, (v16i8)s2, slide_val);  \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
636
/* Four-pair byte slide: (dK, sK) -> outK for K = 0..3, evaluated in that
   order and all using the same immediate 'slide_val'. */
#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,            \
                slide_val, out0, out1, out2, out3)                \
{                                                                 \
    out0 = (RTYPE)__msa_sldi_b((v16i8)d0, (v16i8)s0, slide_val);  \
    out1 = (RTYPE)__msa_sldi_b((v16i8)d1, (v16i8)s1, slide_val);  \
    out2 = (RTYPE)__msa_sldi_b((v16i8)d2, (v16i8)s2, slide_val);  \
    out3 = (RTYPE)__msa_sldi_b((v16i8)d3, (v16i8)s3, slide_val);  \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
646
/* Description : Byte shuffle of two vector pairs under mask control
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is built from byte elements selected out of 'in0'
                 and 'in1' as directed by control vector 'mask0'.
                 'out1' is built from byte elements selected out of 'in2'
                 and 'in3' as directed by control vector 'mask1'.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \
{                                                                      \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);  \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
665
/* Three-pair byte shuffle: (in0, in1, mask0) -> out0,
   (in2, in3, mask1) -> out1 and (in4, in5, mask2) -> out2. */
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);      \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);      \
    out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in5, (v16i8)in4);      \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
673
/* Byte shuffle of one vector pair with four masks: the same sources
   'in0' and 'in1' are shuffled by mask0..mask3 into out0..out3. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,           \
                out0, out1, out2, out3)                                \
{                                                                      \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);  \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in1, (v16i8)in0);  \
    out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in1, (v16i8)in0);  \
    out3 = (RTYPE)__msa_vshf_b((v16i8)mask3, (v16i8)in1, (v16i8)in0);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
682
/* Description : Halfword shuffle of two vector pairs under mask control
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is built from halfword elements selected out of
                 'in0' and 'in1' as directed by control vector 'mask0'.
                 'out1' is built from halfword elements selected out of
                 'in2' and 'in3' as directed by control vector 'mask1'.
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \
{                                                                      \
    out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);  \
    out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
698
/* Three-pair halfword shuffle: (in0, in1, mask0) -> out0,
   (in2, in3, mask1) -> out1 and (in4, in5, mask2) -> out2. */
#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);      \
    out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);      \
    out2 = (RTYPE)__msa_vshf_h((v8i16)mask2, (v8i16)in5, (v8i16)in4);      \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
706
/* Description : Word shuffle of two vector pairs under mask control
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is built from word elements selected out of 'in0'
                 and 'in1' as directed by control vector 'mask0'.
                 'out1' is built from word elements selected out of 'in2'
                 and 'in3' as directed by control vector 'mask1'.
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \
{                                                                      \
    out0 = (RTYPE)__msa_vshf_w((v4i32)mask0, (v4i32)in1, (v4i32)in0);  \
    out1 = (RTYPE)__msa_vshf_w((v4i32)mask1, (v4i32)in3, (v4i32)in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
722
/* Description : Dot product of unsigned byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned bytes of 'mult0' are multiplied by the matching
                 unsigned bytes of 'cnst0', each product being twice as wide
                 as its inputs (unsigned halfword). The products of adjacent
                 odd-even element pairs are summed and written to 'out0'.
                 'out1' is obtained the same way from 'mult1' and 'cnst1'.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);    \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);    \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
741
/* Four-pair unsigned byte dot product: outK is the dot product of
   multK and cnstK for K = 0..3. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,            \
                 cnst0, cnst1, cnst2, cnst3,                   \
                 out0, out1, out2, out3)                       \
{                                                              \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);  \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);  \
    out2 = (RTYPE)__msa_dotp_u_h((v16u8)mult2, (v16u8)cnst2);  \
    out3 = (RTYPE)__msa_dotp_u_h((v16u8)mult3, (v16u8)cnst3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
750
/* Description : Dot product of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed bytes of 'mult0' are multiplied by the matching
                 signed bytes of 'cnst0', each product being twice as wide
                 as its inputs (signed halfword). The products of adjacent
                 odd-even element pairs are summed and written to 'out0'.
                 'out1' is obtained the same way from 'mult1' and 'cnst1'.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);    \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);    \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
769
/* Three-pair signed byte dot product: outK is the dot product of
   multK and cnstK for K = 0..2. */
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);      \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);      \
    out2 = (RTYPE)__msa_dotp_s_h((v16i8)mult2, (v16i8)cnst2);      \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
777
/* Four-pair signed byte dot product: outK is the dot product of
   multK and cnstK for K = 0..3. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);         \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);         \
    out2 = (RTYPE)__msa_dotp_s_h((v16i8)mult2, (v16i8)cnst2);         \
    out3 = (RTYPE)__msa_dotp_s_h((v16i8)mult3, (v16i8)cnst3);         \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
785
/* Description : Dot product of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfwords of 'mult0' are multiplied by the matching
                 signed halfwords of 'cnst0', each product being twice as
                 wide as its inputs (signed word). The products of adjacent
                 odd-even element pairs are summed and written to 'out0'.
                 'out1' is obtained the same way from 'mult1' and 'cnst1'.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);    \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);    \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
804
/* Four-pair signed halfword dot product: outK is the dot product of
   multK and cnstK for K = 0..3. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,            \
                 cnst0, cnst1, cnst2, cnst3,                   \
                 out0, out1, out2, out3)                       \
{                                                              \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);  \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);  \
    out2 = (RTYPE)__msa_dotp_s_w((v8i16)mult2, (v8i16)cnst2);  \
    out3 = (RTYPE)__msa_dotp_s_w((v8i16)mult3, (v8i16)cnst3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
813
/* Description : Dot product of signed bytes accumulated into the output
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read as accumulators, then updated)
                 Return Type - as per RTYPE
   Details     : Signed bytes of 'mult0' are multiplied by the matching
                 signed bytes of 'cnst0', each product being twice as wide
                 as its inputs (signed halfword). The products of adjacent
                 odd-even element pairs are added to the halfword values
                 already held in 'out0'.
                 'out1' is updated the same way from 'mult1' and 'cnst1'.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
{                                                                            \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
834
/* Four-pair signed byte dot product with accumulation: the dot product of
   multK and cnstK is added into outK for K = 0..3. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                         \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)        \
{                                                                            \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
    out2 = (RTYPE)__msa_dpadd_s_h((v8i16)out2, (v16i8)mult2, (v16i8)cnst2);  \
    out3 = (RTYPE)__msa_dpadd_s_h((v8i16)out3, (v16i8)mult3, (v16i8)cnst3);  \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
842
/* Description : Dot product of unsigned bytes accumulated into the output
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read as accumulators, then updated)
                 Return Type - as per RTYPE
   Details     : Unsigned bytes of 'mult0' are multiplied by the matching
                 unsigned bytes of 'cnst0', each product being twice as wide
                 as its inputs (unsigned halfword). The products of adjacent
                 odd-even element pairs are added to the halfword values
                 already held in 'out0'.
                 'out1' is updated the same way from 'mult1' and 'cnst1'.
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
{                                                                            \
    out0 = (RTYPE)__msa_dpadd_u_h((v8u16)out0, (v16u8)mult0, (v16u8)cnst0);  \
    out1 = (RTYPE)__msa_dpadd_u_h((v8u16)out1, (v16u8)mult1, (v16u8)cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
863
/* Description : Dot product of signed halfwords accumulated into the output
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read as accumulators, then updated)
                 Return Type - as per RTYPE
   Details     : Signed halfwords of 'mult0' are multiplied by the matching
                 signed halfwords of 'cnst0', each product being twice as
                 wide as its inputs (signed word). The products of adjacent
                 odd-even element pairs are added to the word values already
                 held in 'out0'.
                 'out1' is updated the same way from 'mult1' and 'cnst1'.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
{                                                                            \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
884
/* Four-pair signed halfword dot product with accumulation: the dot product
   of multK and cnstK is added into outK for K = 0..3. */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                         \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)        \
{                                                                            \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
    out2 = (RTYPE)__msa_dpadd_s_w((v4i32)out2, (v8i16)mult2, (v8i16)cnst2);  \
    out3 = (RTYPE)__msa_dpadd_s_w((v4i32)out3, (v8i16)mult3, (v8i16)cnst3);  \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
892
/* Description : Element-wise unsigned halfword minimum against a limit
                 vector, performed in place
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Every unsigned halfword element of 'in0' is replaced by the
                 smaller of itself and the matching element of 'min_vec'.
                 'in1' is processed the same way against the same 'min_vec'.
                 'min_vec' is passed to the intrinsic without a cast.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)             \
{                                                     \
    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
907
/* Four-vector form: in0..in3 are each limited in place to the unsigned
   halfword minimum of themselves and 'min_vec'. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)   \
{                                                     \
    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
    in2 = (RTYPE)__msa_min_u_h((v8u16)in2, min_vec);  \
    in3 = (RTYPE)__msa_min_u_h((v8u16)in3, min_vec);  \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
914
/* Description : Clamp every signed halfword element to the range [min, max]
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (vector to clamp)
                         - min   (lower bound vector)
                         - max   (upper bound vector)
                 Outputs - in    (clamped in place)
                 Return Type - signed halfword
   Details     : The lower bound is applied first (signed maximum with
                 'min'), then the upper bound (signed minimum with 'max').
*/
#define CLIP_SH(in, min, max)                   \
{                                               \
    in = __msa_max_s_h((v8i16)min, (v8i16)in);  \
    in = __msa_min_s_h((v8i16)max, (v8i16)in);  \
}
928
/* Description : Clamp every signed halfword element to the range 0..255
   Arguments   : Inputs  - in    (vector to clamp)
                 Outputs - in    (clamped in place)
                 Return Type - signed halfwords
   Details     : Negative elements are first raised to 0 with a signed
                 maximum against the immediate 0; the result is then
                 saturated as unsigned with bit position 7, which caps each
                 element at 255.
*/
#define CLIP_SH_0_255(in)                     \
{                                             \
    in = __msa_maxi_s_h((v8i16)in, 0);        \
    in = (v8i16)__msa_sat_u_h((v8u16)in, 7);  \
}
940
/* Clamp the signed halfword elements of two vectors to 0..255 in place:
   'in0' is fully processed first, then 'in1'. */
#define CLIP_SH2_0_255(in0, in1)                \
{                                               \
    in0 = __msa_maxi_s_h((v8i16)in0, 0);        \
    in0 = (v8i16)__msa_sat_u_h((v8u16)in0, 7);  \
    in1 = __msa_maxi_s_h((v8i16)in1, 0);        \
    in1 = (v8i16)__msa_sat_u_h((v8u16)in1, 7);  \
}
946
/* Clamp the signed halfword elements of four vectors to 0..255 in place,
   one vector after another in argument order. */
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH_0_255(in0);                     \
    CLIP_SH_0_255(in1);                     \
    CLIP_SH_0_255(in2);                     \
    CLIP_SH_0_255(in3);                     \
}
952
/* Clamp the signed halfword elements of eight vectors to 0..255 in place,
   handled two vectors at a time in argument order. */
#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
    CLIP_SH2_0_255(in4, in5);               \
    CLIP_SH2_0_255(in6, in7);               \
}
959
/* Description : Clamp every signed word element to the range 0..255
   Arguments   : Inputs  - in    (vector to clamp)
                 Outputs - in    (clamped in place)
                 Return Type - signed word
   Details     : Negative elements are first raised to 0 with a signed
                 maximum against the immediate 0; the result is then
                 saturated as unsigned with bit position 7, which caps each
                 element at 255.
*/
#define CLIP_SW_0_255(in)                     \
{                                             \
    in = __msa_maxi_s_w((v4i32)in, 0);        \
    in = (v4i32)__msa_sat_u_w((v4u32)in, 7);  \
}
971
/* Clamp the signed word elements of two vectors to 0..255 in place:
   'in0' is fully processed first, then 'in1'. */
#define CLIP_SW2_0_255(in0, in1)                \
{                                               \
    in0 = __msa_maxi_s_w((v4i32)in0, 0);        \
    in0 = (v4i32)__msa_sat_u_w((v4u32)in0, 7);  \
    in1 = __msa_maxi_s_w((v4i32)in1, 0);        \
    in1 = (v4i32)__msa_sat_u_w((v4u32)in1, 7);  \
}
977
/* Clamp the signed word elements of four vectors to 0..255 in place,
   one vector after another in argument order. */
#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW_0_255(in0);                     \
    CLIP_SW_0_255(in1);                     \
    CLIP_SW_0_255(in2);                     \
    CLIP_SW_0_255(in3);                     \
}
983
/* Clamp the signed word elements of eight vectors to 0..255 in place,
   handled two vectors at a time in argument order. */
#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
    CLIP_SW2_0_255(in4, in5);               \
    CLIP_SW2_0_255(in6, in7);               \
}
990
/* Description : Sum of the 4 signed word elements of a vector
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum, value of the expression)
                 Return Type - signed word
   Details     : A horizontal add of 'in' with itself yields two doubleword
                 pair sums. The upper doubleword is replicated and added to
                 the lower one, and word element 0 of that result is
                 returned as the macro's value.
*/
#define HADD_SW_S32(in)                                    \
( {                                                        \
    int32_t sum_m;                                         \
    v2i64 pairs_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);  \
    v2i64 hi_m = __msa_splati_d(pairs_m, 1);               \
                                                           \
    pairs_m += hi_m;                                       \
    sum_m = __msa_copy_s_w((v4i32)pairs_m, 0);             \
    sum_m;                                                 \
} )
1009
/* Description : Sum of the 8 unsigned halfword elements of a vector
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum, value of the expression)
                 Return Type - unsigned word
   Details     : Two successive horizontal adds reduce the eight halfwords
                 to four word sums and then to two doubleword sums. The
                 upper doubleword is replicated and added to the lower one,
                 and word element 0 of that result is returned as the
                 macro's value.
*/
#define HADD_UH_U32(in)                                     \
( {                                                         \
    uint32_t sum_m;                                         \
    v4u32 words_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);   \
    v2u64 pairs_m = __msa_hadd_u_d(words_m, words_m);       \
    v2u64 hi_m = (v2u64)__msa_splati_d((v2i64)pairs_m, 1);  \
                                                            \
    pairs_m += hi_m;                                        \
    sum_m = __msa_copy_u_w((v4i32)pairs_m, 0);              \
    sum_m;                                                  \
} )
1030
/* Description : Pairwise horizontal addition of signed byte elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd signed byte element of 'in0' is added to the
                 neighbouring even signed byte element of 'in0'; the
                 halfword sums are written to 'out0'.
                 'out1' is produced the same way from 'in1'.
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)              \
{                                                          \
    out0 = (RTYPE)__msa_hadd_s_h((v16i8)in0, (v16i8)in0);  \
    out1 = (RTYPE)__msa_hadd_s_h((v16i8)in1, (v16i8)in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)
1045
/* Four-vector form: outK holds the pairwise signed byte sums (as
   halfwords) of inK for K = 0..3. */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE)__msa_hadd_s_h((v16i8)in0, (v16i8)in0);            \
    out1 = (RTYPE)__msa_hadd_s_h((v16i8)in1, (v16i8)in1);            \
    out2 = (RTYPE)__msa_hadd_s_h((v16i8)in2, (v16i8)in2);            \
    out3 = (RTYPE)__msa_hadd_s_h((v16i8)in3, (v16i8)in3);            \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1053
/* Description : Pairwise horizontal addition of unsigned byte elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element of 'in0' is added to the
                 neighbouring even unsigned byte element of 'in0'; the
                 halfword sums are written to 'out0'.
                 'out1' is produced the same way from 'in1'.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)              \
{                                                          \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
1068
/* Three-vector form: outK holds the pairwise unsigned byte sums (as
   halfwords) of inK for K = 0..2. */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)   \
{                                                          \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
    out2 = (RTYPE)__msa_hadd_u_h((v16u8)in2, (v16u8)in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)
1075
/* Four-vector form: outK holds the pairwise unsigned byte sums (as
   halfwords) of inK for K = 0..3. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);            \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);            \
    out2 = (RTYPE)__msa_hadd_u_h((v16u8)in2, (v16u8)in2);            \
    out3 = (RTYPE)__msa_hadd_u_h((v16u8)in3, (v16u8)in3);            \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1084
/* Description : Pairwise horizontal subtraction of unsigned byte elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even unsigned byte element of 'in0' is subtracted from
                 the neighbouring odd unsigned byte element of 'in0'
                 (odd - even); the halfword differences are written to
                 'out0'.
                 'out1' is produced the same way from 'in1'.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)              \
{                                                          \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
1100
/* Four-vector form: outK holds the pairwise unsigned byte differences
   (as halfwords) of inK for K = 0..3. */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);            \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);            \
    out2 = (RTYPE)__msa_hsub_u_h((v16u8)in2, (v16u8)in2);            \
    out3 = (RTYPE)__msa_hsub_u_h((v16u8)in3, (v16u8)in3);            \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1108
/* Description : SAD (Sum of Absolute Difference) over two vector pairs
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The per-byte absolute differences of 'in0' against 'ref0'
                 and of 'in1' against 'ref1' are computed. For each of the
                 two difference vectors the 16 bytes are added in even-odd
                 pairs into 8 halfwords, and both sets of halfwords are
                 accumulated into the zero-initialised result, which is the
                 macro's value.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                      \
( {                                                           \
    v8u16 sad_m = { 0 };                                      \
    v16u8 absd0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);  \
    v16u8 absd1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);  \
                                                              \
    sad_m += __msa_hadd_u_h(absd0_m, absd0_m);                \
    sad_m += __msa_hadd_u_h(absd1_m, absd1_m);                \
    sad_m;                                                    \
} )
1131
/* Description : Insert two word values into word elements 0 and 1 of the
                 destination vector
   Arguments   : Inputs  - in0, in1 (word values to insert)
                 Outputs - out      (destination vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to word element 0 and 'in1' to word
                 element 1 of 'out'. 'out' is also read: it is the vector
                 the values are inserted into, so it must already hold a
                 valid value.
*/
#define INSERT_W2(RTYPE, in0, in1, out)               \
{                                                     \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
1145
/* Insert four word values: in0..in3 are written, in that order, to word
   elements 0..3 of 'out'. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)     \
{                                                     \
    INSERT_W2(RTYPE, in0, in1, out);                  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1157
/* Description : Insert two doubleword values into doubleword elements 0
                 and 1 of the destination vector
   Arguments   : Inputs  - in0, in1 (doubleword values to insert)
                 Outputs - out      (destination vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to doubleword element 0 and 'in1' to
                 doubleword element 1 of 'out'.
*/
#define INSERT_D2(RTYPE, in0, in1, out)               \
{                                                     \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1173
/* Description : Interleave the even byte elements of two vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even byte elements of 'in0' and the even byte elements
                 of 'in1' are interleaved and written to 'out0'.
                 The even byte elements of 'in2' and the even byte elements
                 of 'in3' are interleaved and written to 'out1'.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);  \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1192
/* Description : Interleave the even halfword elements of two vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even halfword elements of 'in0' and the even halfword
                 elements of 'in1' are interleaved and written to 'out0'.
                 The even halfword elements of 'in2' and the even halfword
                 elements of 'in3' are interleaved and written to 'out1'.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1210
/* Description : Interleave the even word elements of two vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even word elements of 'in0' and the even word elements
                 of 'in1' are interleaved and written to 'out0'.
                 The even word elements of 'in2' and the even word elements
                 of 'in3' are interleaved and written to 'out1'.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1229
/* Description : Interleave the even doubleword elements of two vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even doubleword elements of 'in0' and the even
                 doubleword elements of 'in1' are interleaved and written
                 to 'out0'.
                 The even doubleword elements of 'in2' and the even
                 doubleword elements of 'in3' are interleaved and written
                 to 'out1'.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);  \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1247
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1266
/* Description : Interleave left half of byte elements from four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVL_B2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by ILVL_B2.
*/
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
{                                                                     \
    ILVL_B2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));       \
    ILVL_B2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));       \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1277
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of
                 halfword elements of in3 are interleaved and copied to out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1294
/* Description : Interleave left half of halfword elements from four vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVL_H2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by ILVL_H2.
*/
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
{                                                                     \
    ILVL_H2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));       \
    ILVL_H2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));       \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1303
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1321
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 The ILVR_B3/B4/B8 variants do the same for more pairs.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    (out1) = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1342
/* Description : Interleave right half of byte elements from three vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : Applies ILVR_B2 to (in0, in1, in2, in3), then interleaves
                 (in4, in5) into out2.
                 Parameters are parenthesized (also when forwarded), so an
                 expression argument is cast as a whole.
*/
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));         \
    (out2) = (RTYPE) __msa_ilvr_b((v16i8) (in4), (v16i8) (in5));        \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1352
/* Description : Interleave right half of byte elements from four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_B2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by ILVR_B2.
*/
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
{                                                                     \
    ILVR_B2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));       \
    ILVR_B2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));       \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1364
/* Description : Interleave right half of byte elements from eight vector
                 pairs
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ... out7
                 Return Type - as per RTYPE
   Details     : Applies ILVR_B4 to the first eight inputs (out0..out3) and
                 then to the last eight inputs (out4..out7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole further down the chain.
*/
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,              \
                in8, in9, in10, in11, in12, in13, in14, in15,               \
                out0, out1, out2, out3, out4, out5, out6, out7)             \
{                                                                           \
    ILVR_B4(RTYPE, (in0), (in1), (in2), (in3), (in4), (in5), (in6), (in7),  \
            (out0), (out1), (out2), (out3));                                \
    ILVR_B4(RTYPE, (in8), (in9), (in10), (in11),                            \
            (in12), (in13), (in14), (in15),                                 \
            (out4), (out5), (out6), (out7));                                \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
1376
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 The ILVR_H3/H4 variants do the same for more pairs.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    (out1) = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1394
/* Description : Interleave right half of halfword elements from three vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : Applies ILVR_H2 to (in0, in1, in2, in3), then interleaves
                 (in4, in5) into out2.
                 Parameters are parenthesized (also when forwarded), so an
                 expression argument is cast as a whole.
*/
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));         \
    (out2) = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5));        \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1401
/* Description : Interleave right half of halfword elements from four vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_H2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by ILVR_H2.
*/
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
{                                                                     \
    ILVR_H2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));       \
    ILVR_H2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));       \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1410
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 receives __msa_ilvr_w(in0, in1) and out1 receives
                 __msa_ilvr_w(in2, in3), in that order.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    (out1) = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1419
/* Description : Interleave right half of word elements from four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_W2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by ILVR_W2.
*/
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
{                                                                     \
    ILVR_W2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));       \
    ILVR_W2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));       \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1428
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to
                 out1.
                 The ILVR_D3/D4 variants do the same for more pairs.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    (out1) = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1446
/* Description : Interleave right half of double word elements from three
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : Applies ILVR_D2 to (in0, in1, in2, in3), then interleaves
                 (in4, in5) into out2.
                 Parameters are parenthesized (also when forwarded), so an
                 expression argument is cast as a whole.
*/
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));         \
    (out2) = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));        \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1453
/* Description : Interleave right half of double word elements from four
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_D2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by ILVR_D2.
*/
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
{                                                                     \
    ILVR_D2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));       \
    ILVR_D2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));       \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1462
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to
                 out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1480
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'.
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'.
                 'out0' is written first, so it must not name 'in0' or 'in1'
                 if the original inputs are still wanted for 'out1'.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1500
/* Description : Interleave both right and left half of halfword elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 receives __msa_ilvr_h(in0, in1), then out1 receives
                 __msa_ilvl_h(in0, in1).
                 'out0' is written first, so it must not name 'in0' or 'in1'
                 if the original inputs are still wanted for 'out1'.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1510
/* Description : Interleave both right and left half of word elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 receives __msa_ilvr_w(in0, in1), then out1 receives
                 __msa_ilvl_w(in0, in1).
                 'out0' is written first, so it must not name 'in0' or 'in1'
                 if the original inputs are still wanted for 'out1'.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    (out0) = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    (out1) = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1519
/* Description : Clamp signed halfword elements from below with an immediate
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each vector is replaced by the element-wise maximum of its
                 signed halfword elements and the signed immediate 'max_val'
                 ('in0' first, then 'in1').
                 The MAXI_SH4/SH8 variants handle four and eight vectors.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                     \
{                                                              \
    (in0) = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    (in1) = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1535
/* Four-vector form of MAXI_SH2: in0, in1, in2 and in3 are updated in place,
   in that order, with the signed halfword maximum against 'max_val'. */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)           \
{                                                              \
    (in0) = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    (in1) = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
    (in2) = (RTYPE) __msa_maxi_s_h((v8i16) (in2), (max_val));  \
    (in3) = (RTYPE) __msa_maxi_s_h((v8i16) (in3), (max_val));  \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1543
/* Eight-vector form of MAXI_SH2: in0 ... in7 are updated in place, pair by
   pair and in argument order, with the signed halfword maximum against
   'max_val'. */
#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH2(RTYPE, in0, in1, max_val);                                   \
    MAXI_SH2(RTYPE, in2, in3, max_val);                                   \
    MAXI_SH2(RTYPE, in4, in5, max_val);                                   \
    MAXI_SH2(RTYPE, in6, in7, max_val);                                   \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1551
/* Description : Unsigned saturation of halfword elements
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element of 'in0' and then of 'in1' is
                 saturated to the largest unsigned value that fits in
                 (sat_val + 1) bits; the element width is unchanged.
                 Results are written back to the original vectors.
                 The SAT_UH4/UH8 variants handle four and eight vectors.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                     \
{                                                             \
    (in0) = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val));  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1569
/* Four-vector form of SAT_UH2: in0, in1, in2 and in3 are saturated in place,
   in that order, with __msa_sat_u_h and the same 'sat_val'. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)           \
{                                                             \
    (in0) = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val));  \
    (in2) = (RTYPE) __msa_sat_u_h((v8u16) (in2), (sat_val));  \
    (in3) = (RTYPE) __msa_sat_u_h((v8u16) (in3), (sat_val));  \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1577
/* Eight-vector form of SAT_UH2: in0 ... in7 are saturated in place, pair by
   pair and in argument order, with the same 'sat_val'. */
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH2(RTYPE, in0, in1, sat_val);                                   \
    SAT_UH2(RTYPE, in2, in3, sat_val);                                   \
    SAT_UH2(RTYPE, in4, in5, sat_val);                                   \
    SAT_UH2(RTYPE, in6, in7, sat_val);                                   \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1585
/* Description : Signed saturation of halfword elements
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'in0' and then of 'in1' is
                 passed through __msa_sat_s_h, i.e. saturated to the signed
                 range of (sat_val + 1) bits; the element width is unchanged.
                 Results are written back to the original vectors.
                 The SAT_SH3/SH4 variants handle three and four vectors.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                     \
{                                                             \
    (in0) = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val));  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1602
/* Three-vector form of SAT_SH2: in0, in1 and in2 are saturated in place, in
   that order, with __msa_sat_s_h and the same 'sat_val'. */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)                \
{                                                             \
    (in0) = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val));  \
    (in2) = (RTYPE) __msa_sat_s_h((v8i16) (in2), (sat_val));  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1609
/* Four-vector form of SAT_SH2: in0, in1, in2 and in3 are saturated in place,
   in that order, with __msa_sat_s_h and the same 'sat_val'. */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)           \
{                                                             \
    (in0) = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val));  \
    (in2) = (RTYPE) __msa_sat_s_h((v8i16) (in2), (sat_val));  \
    (in3) = (RTYPE) __msa_sat_s_h((v8i16) (in3), (sat_val));  \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1616
/* Description : Signed saturation of word elements
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element of 'in0' and then of 'in1' is
                 passed through __msa_sat_s_w, i.e. saturated to the signed
                 range of (sat_val + 1) bits; the element width is unchanged.
                 Results are written back to the original vectors.
                 The SAT_SW4 variant handles four vectors.
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)                     \
{                                                             \
    (in0) = (RTYPE) __msa_sat_s_w((v4i32) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_s_w((v4i32) (in1), (sat_val));  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1633
/* Four-vector form of SAT_SW2: in0, in1, in2 and in3 are saturated in place,
   in that order, with __msa_sat_s_w and the same 'sat_val'. */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)           \
{                                                             \
    (in0) = (RTYPE) __msa_sat_s_w((v4i32) (in0), (sat_val));  \
    (in1) = (RTYPE) __msa_sat_s_w((v4i32) (in1), (sat_val));  \
    (in2) = (RTYPE) __msa_sat_s_w((v4i32) (in2), (sat_val));  \
    (in3) = (RTYPE) __msa_sat_s_w((v4i32) (in3), (sat_val));  \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1640
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector.
                 'idx1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector.
                 Valid index range for halfword operation is 0-7.
                 'out0' is written before 'in' is read again for 'out1'.
                 'in' is parenthesized, so an expression argument is cast as
                 a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)        \
{                                                           \
    (out0) = (RTYPE) __msa_splati_h((v8i16) (in), (idx0));  \
    (out1) = (RTYPE) __msa_splati_h((v8i16) (in), (idx1));  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1657
/* Description : Replicate three indexed halfword elements of one vector
   Arguments   : Inputs  - in, idx0, idx1, idx2
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : Applies SPLATI_H2 for (idx0, idx1) and then replicates
                 element 'idx2' of 'in' into 'out2'.
                 'in' is parenthesized (also when forwarded), so an
                 expression argument is cast as a whole.
*/
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,                       \
                  out0, out1, out2)                                  \
{                                                                    \
    SPLATI_H2(RTYPE, (in), (idx0), (idx1), (out0), (out1));          \
    (out2) = (RTYPE) __msa_splati_h((v8i16) (in), (idx2));           \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1666
/* Description : Replicate four indexed halfword elements of one vector
   Arguments   : Inputs  - in, idx0, idx1, idx2, idx3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies SPLATI_H2 for (idx0, idx1) and then for
                 (idx2, idx3).
                 Arguments are forwarded in parentheses, so an expression
                 passed as 'in' is cast as a whole by SPLATI_H2.
*/
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,                 \
                  out0, out1, out2, out3)                            \
{                                                                    \
    SPLATI_H2(RTYPE, (in), (idx0), (idx1), (out0), (out1));          \
    SPLATI_H2(RTYPE, (in), (idx2), (idx3), (out2), (out3));          \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1675
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector.
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector.
                 Valid index range for word operation is 0-3.
                 'in' and 'stidx' are parenthesized, so expression arguments
                 are evaluated as a whole before the cast / the '+ 1'.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                    \
{                                                                  \
    (out0) = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    (out1) = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1694
/* Description : Replicate each of the four word elements of one vector
   Arguments   : Inputs  - in
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies SPLATI_W2 with start index 0 (out0, out1) and then
                 with start index 2 (out2, out3).
                 Arguments are forwarded in parentheses, so an expression
                 passed as 'in' is cast as a whole by SPLATI_W2.
*/
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, (in), 0, (out0), (out1));        \
    SPLATI_W2(RTYPE, (in), 2, (out2), (out3));        \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1702
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                  \
    (out0) = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    (out1) = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1723
/* Description : Pack even byte elements of three vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : Applies PCKEV_B2 to (in0, in1, in2, in3), then packs
                 (in4, in5) into out2.
                 Parameters are parenthesized (also when forwarded), so an
                 expression argument is cast as a whole.
*/
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));         \
    (out2) = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5));        \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1731
/* Description : Pack even byte elements of four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies PCKEV_B2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by PCKEV_B2.
*/
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,       \
                 out0, out1, out2, out3)                              \
{                                                                     \
    PCKEV_B2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));      \
    PCKEV_B2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));      \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1742
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                  \
    (out0) = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    (out1) = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1761
/* Description : Pack even halfword elements of four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies PCKEV_H2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by PCKEV_H2.
*/
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,       \
                 out0, out1, out2, out3)                              \
{                                                                     \
    PCKEV_H2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));      \
    PCKEV_H2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));      \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1770
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of in0 are copied to the left half
                 of out0 & even double word elements of in1 are copied to the
                 right half of out0.
                 Even double word elements of in2 are copied to the left half
                 of out1 & even double word elements of in3 are copied to the
                 right half of out1.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                  \
    (out0) = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    (out1) = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1790
/* Description : Pack even double word elements of four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies PCKEV_D2 to (in0, in1, in2, in3) and then to
                 (in4, in5, in6, in7).
                 Arguments are forwarded in parentheses, so an expression
                 argument is cast as a whole by PCKEV_D2.
*/
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,       \
                 out0, out1, out2, out3)                              \
{                                                                     \
    PCKEV_D2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));      \
    PCKEV_D2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));      \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1798
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 receives __msa_pckod_d(in0, in1): the odd (index 1)
                 double word elements of 'in0' and 'in1' packed into one
                 vector.
                 out1 receives __msa_pckod_d(in2, in3) in the same way.
                 When both vectors of a pair are the same, index 1 double
                 word element is copied over index 0.
                 out0 is written before in2/in3 are read.
                 Parameters are parenthesized, so an expression argument is
                 cast as a whole.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)            \
{                                                                  \
    (out0) = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));  \
    (out1) = (RTYPE) __msa_pckod_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1816
/* Description : XOR every byte element with the immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element of 'in0' is xor'ed with 128 and
                 the result is stored back into 'in0'; 'in1' is then treated
                 the same way.
                 The XORI_B3..B8 variants handle three to eight vectors.
*/
#define XORI_B2_128(RTYPE, in0, in1)                     \
{                                                        \
    (in0) = (RTYPE) __msa_xori_b((v16u8) (in0), 128);    \
    (in1) = (RTYPE) __msa_xori_b((v16u8) (in1), 128);    \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1837
/* Three-vector form of XORI_B2_128: in0, in1 and in2 are xor'ed with 128 in
   place, in that order. */
#define XORI_B3_128(RTYPE, in0, in1, in2)                \
{                                                        \
    (in0) = (RTYPE) __msa_xori_b((v16u8) (in0), 128);    \
    (in1) = (RTYPE) __msa_xori_b((v16u8) (in1), 128);    \
    (in2) = (RTYPE) __msa_xori_b((v16u8) (in2), 128);    \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1844
/* Four-vector form of XORI_B2_128: in0, in1, in2 and in3 are xor'ed with 128
   in place, in that order. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3)           \
{                                                        \
    (in0) = (RTYPE) __msa_xori_b((v16u8) (in0), 128);    \
    (in1) = (RTYPE) __msa_xori_b((v16u8) (in1), 128);    \
    (in2) = (RTYPE) __msa_xori_b((v16u8) (in2), 128);    \
    (in3) = (RTYPE) __msa_xori_b((v16u8) (in3), 128);    \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
1853
/* Five-vector form: in0..in4 are xor'ed with 128 in place, in argument
   order (first two vectors, then the remaining three). */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B2_128(RTYPE, in0, in1);                    \
    XORI_B3_128(RTYPE, in2, in3, in4);               \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1860
/* Six-vector form: in0..in5 are xor'ed with 128 in place, in argument
   order (two groups of three vectors). */
#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B3_128(RTYPE, in0, in1, in2);                    \
    XORI_B3_128(RTYPE, in3, in4, in5);                    \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
1867
/* Seven-vector form: in0..in6 are xor'ed with 128 in place, in argument
   order (first three vectors, then the remaining four). */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B3_128(RTYPE, in0, in1, in2);                         \
    XORI_B4_128(RTYPE, in3, in4, in5, in6);                    \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1874
/* Eight-vector form: in0..in7 are xor'ed with 128 in place, in argument
   order (first six vectors, then the last two). */
#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5);               \
    XORI_B2_128(RTYPE, in6, in7);                                   \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1882
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 to +32767 (as per halfword data
                 type) and written to 'out0'.
                 'in2' and 'in3' produce 'out1' the same way.
                 Arguments are parenthesized before the v8i16 cast so the
                 cast always applies to the whole argument.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1898
/* Four-pair form of the saturating signed halfword addition:
   out0 = in0 + in1, out1 = in2 + in3, out2 = in4 + in5, out3 = in6 + in7,
   each computed with __msa_adds_s_h and cast to RTYPE. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);     \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);     \
    out2 = (RTYPE) __msa_adds_s_h((v8i16) in4, (v8i16) in5);     \
    out3 = (RTYPE) __msa_adds_s_h((v8i16) in6, (v8i16) in7);     \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1907
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per input vector type
   Details     : 'in0' is left shifted by 'shift' with the C '<<' operator and
                 the result is written back to 'in0'; likewise for 'in1'.
                 SLLI_4V does the same for four operands.
                 'shift' and the operands are parenthesized so that an
                 expression such as 'a | b' is used as a whole.
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 = (in0) << (shift);       \
    in1 = (in1) << (shift);       \
}
/* Four-operand left shift: each of in0..in3 is shifted left by 'shift' in
   place. Operands and 'shift' are parenthesized so expression arguments are
   evaluated as a whole. */
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = (in0) << (shift);                 \
    in1 = (in1) << (shift);                 \
    in2 = (in2) << (shift);                 \
    in3 = (in3) << (shift);                 \
}
1928
/* Description : Shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector type
   Details     : Each of 'in0'..'in3' is right shifted by 'shift' with the C
                 '>>' operator and the result is written back in place.
                 Here, 'shift' is GP variable passed in
                 Operands and 'shift' are parenthesized so expression
                 arguments are evaluated as a whole.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
1946
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 'in1', 'in2' and 'in3' are processed the same way.
                 Arguments are parenthesized before the v8i16 cast so the
                 cast always applies to the whole argument.
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1965
/* Same shape as SRL_H4 but built on the __msa_srlr_h intrinsic: in0..in3
   are each shifted by the vector 'shift' and written back in place, cast to
   RTYPE. Arguments are parenthesized before the v8i16 cast so the cast
   always applies to the whole argument. */
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                                \
    in0 = (RTYPE) __msa_srlr_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
1975
/* Eight-vector form of SRLR_H4: in0..in7 are each passed through
   __msa_srlr_h with the vector 'shift' and written back in place. */
#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);            \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);            \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);            \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);            \
    in4 = (RTYPE) __msa_srlr_h((v8i16) in4, (v8i16) shift);            \
    in5 = (RTYPE) __msa_srlr_h((v8i16) in5, (v8i16) shift);            \
    in6 = (RTYPE) __msa_srlr_h((v8i16) in6, (v8i16) shift);            \
    in7 = (RTYPE) __msa_srlr_h((v8i16) in7, (v8i16) shift);            \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1983
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 'in1' is processed the same way.
                 Arguments are parenthesized before the v8i16 cast so the
                 cast always applies to the whole argument.
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2002
/* Three-vector form of SRAR_H2: in0, in1 and in2 are shifted right
   (arithmetic, rounded) by the vector 'shift', in place.
   The nested SRAR_H2 invocation is terminated with ';' like every other
   nested macro call in this file, and 'shift' is forwarded parenthesized so
   an expression argument is cast as a whole. */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, (shift));                           \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2009
/* Four-vector form of SRAR_H2: in0..in3 are shifted right (arithmetic,
   rounded) by the vector 'shift', in place.
   Nested invocations are terminated with ';' for consistency with the other
   composite macros, and 'shift' is forwarded parenthesized. */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, (shift));             \
    SRAR_H2(RTYPE, in2, in3, (shift));             \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2017
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 'in1' is processed the same way.
                 Arguments are parenthesized before the v4i32 cast so the
                 cast always applies to the whole argument.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift));  \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift));  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2035
/* Four-vector form of SRAR_W2: in0..in3 are shifted right (arithmetic,
   rounded) by the vector 'shift', in place.
   Nested invocations are terminated with ';' for consistency with the other
   composite macros, and 'shift' is forwarded parenthesized. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, (shift));             \
    SRAR_W2(RTYPE, in2, in3, (shift));             \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2042
/* Description : Shift right arithmetic rounded (immediate), halfword elements
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 'in1' is processed the same way; SRARI_H4 covers four
                 vectors.
                 Arguments are parenthesized so expression arguments are
                 used as a whole.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), (shift));  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2060
/* Four-vector form of SRARI_H2: in0..in3 are each passed through
   __msa_srari_h with the immediate 'shift' and written back in place. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
    in2 = (RTYPE) __msa_srari_h((v8i16) in2, shift);  \
    in3 = (RTYPE) __msa_srari_h((v8i16) in3, shift);  \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2068
/* Description : Shift right arithmetic rounded (immediate), word elements
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 'in1' is processed the same way.
                 Arguments are parenthesized so expression arguments are
                 used as a whole.
*/
#define SRARI_W2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), (shift));  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2085
/* Four-vector form of SRARI_W2: in0..in3 are each passed through
   __msa_srari_w with the immediate 'shift' and written back in place. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
    in2 = (RTYPE) __msa_srari_w((v4i32) in2, shift);  \
    in3 = (RTYPE) __msa_srari_w((v4i32) in3, shift);  \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2093
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : 'in0' is multiplied with 'in1' using the C '*' operator and
                 the result is written to 'out0'; 'in2' * 'in3' goes to
                 'out1'.
                 Operands are parenthesized so that an expression argument
                 such as 'a + b' is multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
/* Four-pair multiplication: out0 = in0 * in1, out1 = in2 * in3,
   out2 = in4 * in5, out3 = in6 * in7, evaluated in that order.
   Operands are parenthesized so expression arguments are multiplied as a
   whole. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = (in0) * (in1);                                                     \
    out1 = (in2) * (in3);                                                     \
    out2 = (in4) * (in5);                                                     \
    out3 = (in6) * (in7);                                                     \
}
2111
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : out0 = in0 + in1 and out1 = in2 + in3, using the C '+'
                 operator.
                 Operands are parenthesized so that arguments built from
                 lower-precedence operators (shifts, '&', '?:') are added as
                 a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
/* Four-pair addition: out0 = in0 + in1, out1 = in2 + in3,
   out2 = in4 + in5, out3 = in6 + in7, evaluated in that order.
   Operands are parenthesized so expression arguments are added as a whole. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = (in0) + (in1);                                                     \
    out1 = (in2) + (in3);                                                     \
    out2 = (in4) + (in5);                                                     \
    out3 = (in6) + (in7);                                                     \
}
2128
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : out0 = in0 - in1 and out1 = in2 - in3, using the C '-'
                 operator.
                 Operands are parenthesized so that a subtrahend such as
                 'a - b' is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) - (in1);                     \
    out1 = (in2) - (in3);                     \
}
/* Four-pair subtraction: out0 = in0 - in1, out1 = in2 - in3,
   out2 = in4 - in5, out3 = in6 - in7, evaluated in that order.
   Operands are parenthesized so expression arguments are subtracted as a
   whole. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = (in0) - (in1);                                                     \
    out1 = (in2) - (in3);                                                     \
    out2 = (in4) - (in5);                                                     \
    out3 = (in6) - (in7);                                                     \
}
2147
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : A per-byte sign mask is built by comparing 'in' against 0
                 (__msa_clti_s_b) and is right-interleaved with 'in' itself
                 (__msa_ilvr_b) to generate 8 halfword elements keeping sign
                 intact.
                 'in' is parenthesized before the v16i8 cast so the cast
                 always applies to the whole argument.
*/
#define UNPCK_R_SB_SH(in, out)                         \
{                                                      \
    v16i8 sign_m = __msa_clti_s_b((v16i8) (in), 0);    \
                                                       \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) (in));  \
}
2163
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vector)
                 Return Type - signed word
   Details     : A per-halfword sign mask is built by comparing 'in' against
                 0 (__msa_clti_s_h) and is right-interleaved with 'in'
                 itself (__msa_ilvr_h) to generate 4 word elements keeping
                 sign intact.
                 'in' is parenthesized before the v8i16 cast so the cast
                 always applies to the whole argument.
*/
#define UNPCK_R_SH_SW(in, out)                         \
{                                                      \
    v8i16 sign_m = __msa_clti_s_h((v8i16) (in), 0);    \
                                                       \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) (in));  \
}
2179
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : A per-byte sign mask is built by comparing 'in' against 0
                 (__msa_clti_s_b). ILVRL_B2_SH then interleaves the mask with
                 'in': the right interleave gives 8 signed halfword elements
                 in 'out0', the left interleave gives 8 in 'out1'.
                 'in' is parenthesized before the v16i8 cast and when
                 forwarded, so an expression argument is used as a whole.
*/
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0);         \
    ILVRL_B2_SH(tmp_m, (in), out0, out1);            \
}
2198
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 halfword vectors)
                 Return Type - signed halfword
   Details     : An all-zero byte vector is interleaved with 'in' through
                 ILVRL_B2_SH: the zero extended right half of 'in' is
                 returned in 'out0' and the zero extended left half in
                 'out1'.
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zeros_m = { 0 };                            \
                                                      \
    ILVRL_B2_SH(zeros_m, in, out0, out1);             \
}
2212
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : A per-halfword sign mask is built by comparing 'in' against
                 0 (__msa_clti_s_h). ILVRL_H2_SW then interleaves the mask
                 with 'in': the right interleave gives 4 signed word elements
                 in 'out0', the left interleave gives 4 in 'out1'.
                 'in' is parenthesized before the v8i16 cast and when
                 forwarded, so an expression argument is used as a whole.
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0);         \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);            \
}
2231
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : The values of 'in0' and 'in1' are exchanged through a
                 temporary of the same type as 'in0'.
                 Unlike an xor-based swap, this leaves the value intact when
                 both arguments designate the same object.
*/
#define SWAP(in0, in1)                    \
{                                         \
    __typeof__(in0) swap_tmp_m = (in0);   \
                                          \
    in0 = (in1);                          \
    in1 = swap_tmp_m;                     \
}
2243
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3.
                 The assignments run in that order, so an output that aliases
                 an input is overwritten before the later differences are
                 computed.
                 Operands are parenthesized so expression arguments are
                 added/subtracted as a whole.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = (in0) + (in3);                                        \
    out1 = (in1) + (in2);                                        \
                                                                 \
    out2 = (in1) - (in2);                                        \
    out3 = (in0) - (in3);                                        \
}
2257
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : out0..out3 are the sums in0 + in7, in1 + in6, in2 + in5,
                 in3 + in4; out4..out7 are the differences in3 - in4,
                 in2 - in5, in1 - in6, in0 - in7.
                 The assignments run in that order, so an output that aliases
                 an input is overwritten before the later differences are
                 computed.
                 Operands are parenthesized so expression arguments are
                 added/subtracted as a whole.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = (in0) + (in7);                                            \
    out1 = (in1) + (in6);                                            \
    out2 = (in2) + (in5);                                            \
    out3 = (in3) + (in4);                                            \
                                                                     \
    out4 = (in3) - (in4);                                            \
    out5 = (in2) - (in5);                                            \
    out6 = (in1) - (in6);                                            \
    out7 = (in0) - (in7);                                            \
}
2276
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : out0..out7 are the sums in(k) + in(15 - k) for k = 0..7;
                 out8..out15 are the differences in(15 - j) - in(j) for
                 j = 8..15 (in7 - in8 up to in0 - in15).
                 The assignments run in that order, so an output that aliases
                 an input is overwritten before the later differences are
                 computed.
                 Operands are parenthesized so expression arguments are
                 added/subtracted as a whole.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = (in0) + (in15);                                                  \
    out1 = (in1) + (in14);                                                  \
    out2 = (in2) + (in13);                                                  \
    out3 = (in3) + (in12);                                                  \
    out4 = (in4) + (in11);                                                  \
    out5 = (in5) + (in10);                                                  \
    out6 = (in6) + (in9);                                                   \
    out7 = (in7) + (in8);                                                   \
                                                                            \
    out8 = (in7) - (in8);                                                   \
    out9 = (in6) - (in9);                                                   \
    out10 = (in5) - (in10);                                                 \
    out11 = (in4) - (in11);                                                 \
    out12 = (in3) - (in12);                                                 \
    out13 = (in2) - (in13);                                                 \
    out14 = (in1) - (in14);                                                 \
    out15 = (in0) - (in15);                                                 \
}
2305
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The inputs are combined pairwise with ILVR_D2_SB, byte
                 interleaved with ILVRL_B2_SB and merged into 'out0' with
                 __msa_ilvr_b. 'out1', 'out2' and 'out3' are then derived in
                 sequence, each from the previous output, via __msa_sldi_b
                 with a zero vector and a slide amount of 4.
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zeros_m = { 0 };                                              \
    v16i8 pair01_m, pair23_m, ilv_r_m, ilv_l_m;                         \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, pair01_m, pair23_m);                 \
    ILVRL_B2_SB(pair23_m, pair01_m, ilv_r_m, ilv_l_m);                  \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(ilv_l_m, ilv_r_m);                      \
    out1 = (v16u8) __msa_sldi_b(zeros_m, (v16i8) out0, 4);              \
    out2 = (v16u8) __msa_sldi_b(zeros_m, (v16i8) out1, 4);              \
    out3 = (v16u8) __msa_sldi_b(zeros_m, (v16i8) out2, 4);              \
}
2325
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Inputs are paired (in0/in4, in1/in5, in2/in6, in3/in7) with
                 ILVEV_W2_SB and byte interleaved with __msa_ilvr_b; the two
                 partial results are halfword interleaved (ILVRL_H2_SB) and
                 word interleaved (ILVRL_W2) into 'out0' and 'out2'.
                 'out1' and 'out3' are then formed from 'out0' and 'out2'
                 with __msa_ilvl_d.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 ev_a_m, ev_b_m, ilv_01_m, ilv_23_m;                           \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, ev_a_m, ev_b_m);                    \
    ilv_01_m = __msa_ilvr_b(ev_b_m, ev_a_m);                            \
    ILVEV_W2_SB(in2, in6, in3, in7, ev_a_m, ev_b_m);                    \
                                                                        \
    ilv_23_m = __msa_ilvr_b(ev_b_m, ev_a_m);                            \
    ILVRL_H2_SB(ilv_23_m, ilv_01_m, ev_a_m, ev_b_m);                    \
                                                                        \
    ILVRL_W2(RTYPE, ev_b_m, ev_a_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2350
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are byte interleaved in pairs (ILVR_B4_SB), interleaved
                 again (ILVRL_B2_SB) and word interleaved (ILVRL_W2) to give
                 the even outputs out0, out2, out4, out6. The odd outputs
                 out1, out3, out5, out7 are derived from the even ones with
                 SLDI_B4, a zero vector and a slide amount of 8.
                 The zero vector local carries the '_m' suffix used by the
                 other macro locals so it cannot shadow a caller variable
                 named 'zeros'.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
    v16i8 zeros_m = { 0 };                                               \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B4(RTYPE, zeros_m, out0, zeros_m, out2, zeros_m, out4,          \
            zeros_m, out6, 8, out1, out3, out5, out7);                   \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2377
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Inputs four rows apart (in0/in4/in8/in12, ...) are gathered
                 with ILVEV_W2_SD and __msa_ilvev_d. 'out1' and 'out3' serve
                 as scratch for the first two gathered vectors before
                 receiving their final values, so the statement order below
                 must be kept. The gathered vectors are then interleaved at
                 byte and halfword granularity: even bytes feed out0/out2,
                 odd bytes feed out1/out3.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 part_a_m, part_b_m, gath2_m, gath3_m;                            \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, part_a_m, part_b_m);                  \
    out1 = (v16u8) __msa_ilvev_d(part_b_m, part_a_m);                      \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, part_a_m, part_b_m);                  \
    out3 = (v16u8) __msa_ilvev_d(part_b_m, part_a_m);                      \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, part_a_m, part_b_m);                 \
                                                                           \
    gath2_m = __msa_ilvev_d(part_b_m, part_a_m);                           \
    ILVEV_W2_SD(in3, in7, in11, in15, part_a_m, part_b_m);                 \
                                                                           \
    gath3_m = __msa_ilvev_d(part_b_m, part_a_m);                           \
    ILVEV_B2_SD(out1, out3, gath2_m, gath3_m, part_a_m, part_b_m);         \
    out0 = (v16u8) __msa_ilvev_h((v8i16) part_b_m, (v8i16) part_a_m);      \
    out2 = (v16u8) __msa_ilvod_h((v8i16) part_b_m, (v8i16) part_a_m);      \
                                                                           \
    part_a_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);          \
    part_b_m = (v2i64) __msa_ilvod_b((v16i8) gath3_m, (v16i8) gath2_m);    \
    out1 = (v16u8) __msa_ilvev_h((v8i16) part_b_m, (v8i16) part_a_m);      \
    out3 = (v16u8) __msa_ilvod_h((v8i16) part_b_m, (v8i16) part_a_m);      \
}
2412
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : The 16 inputs are the rows of the block. Row 'inN' is paired
                 with row 'in(N + 8)' by the ILVEV_D2_UB calls, and the
                 result is then reshuffled in three even/odd interleave
                 stages: bytes, halfwords and finally words.
                 NOTE(review): ILVEV_D2_UB and ILVEV_H2_UB are defined outside
                 this section. Assuming they interleave the even doubleword /
                 halfword elements of each argument pair, only the low 8 bytes
                 of every input are used and 'outN' ends up holding byte N of
                 'in0'..'in15', in row order.
                 'out0'..'out7' are also used as scratch registers before
                 their final values are written, and the first of them are
                 assigned while later inputs have not been read yet, so the
                 outputs should be distinct variables from all the inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2459
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Row pairs 'in0'/'in1' and 'in2'/'in3' are interleaved at
                 halfword granularity into 's0_m' and 's1_m', which are then
                 interleaved at word granularity into 'out0' (right half) and
                 'out2' (left half).
                 'out1' and 'out3' are obtained by moving the upper doubleword
                 of 'out0' and 'out2' respectively into the low doubleword.
                 Only the low four halfwords of each output hold a transposed
                 row; the upper four halfwords are leftovers of the shuffle
                 (data belonging to other rows) and should be ignored.
                 NOTE(review): ILVR_H2_SH and ILVRL_W2_SH are defined outside
                 this section; the description above assumes they are the
                 usual right / right+left interleave helpers, in which case
                 only the low four halfwords of each input are read.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
2475
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows 'in4'..'in7' are interleaved twice at halfword
                 granularity into 'tmp0_m'..'tmp3_m', and rows 'in0'..'in3'
                 likewise into 'tmp4_m'..'tmp7_m'; 's0_m' and 's1_m' are
                 scratch registers reused by every stage.
                 The even outputs are then built by packing the even (low)
                 doublewords of each 'tmpN_m' / 'tmp(N + 4)_m' pair via
                 PCKEV_D4, and the odd outputs by packing the odd (high)
                 doublewords of the same pairs.
                 Every input is consumed into a temporary before the first
                 output is assigned.
                 NOTE(review): ILVR_H2_SH, ILVL_H2_SH, ILVRL_H2_SH and
                 PCKEV_D4 are defined outside this section; the description
                 assumes their names reflect the underlying MSA operation.
                 TRANSPOSE8x8_UH_UH and TRANSPOSE8x8_SH_SH instantiate the
                 macro with RTYPE v8u16 and v8i16 respectively.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2506
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Rows 'in0'/'in1' and rows 'in2'/'in3' are interleaved word by
                 word into a right and a left intermediate each. The
                 doubleword halves of the two right intermediates then form
                 'out0' and 'out1', and those of the two left intermediates
                 form 'out2' and 'out3'.
                 Assuming ILVRL_W2_SW (defined elsewhere in this file) yields
                 the right / left word interleave of its first two arguments,
                 'outN' holds word N of 'in0', 'in1', 'in2' and 'in3'.
                 All inputs are consumed into temporaries before any output
                 is assigned.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 r01_m, l01_m;                                                 \
    v4i32 r23_m, l23_m;                                                 \
                                                                        \
    ILVRL_W2_SW(in1, in0, r01_m, l01_m);                                \
    ILVRL_W2_SW(in3, in2, r23_m, l23_m);                                \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) r23_m, (v2i64) r01_m);          \
    out1 = (v4i32) __msa_ilvl_d((v2i64) r23_m, (v2i64) r01_m);          \
    out2 = (v4i32) __msa_ilvr_d((v2i64) l23_m, (v2i64) l01_m);          \
    out3 = (v4i32) __msa_ilvl_d((v2i64) l23_m, (v2i64) l01_m);          \
}
2525
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The unsigned byte elements of each input pair ('in0'/'in1',
                 'in2'/'in3', 'in4'/'in5', 'in6'/'in7') are averaged as
                 (a + b)/2 into 'avg0_m'..'avg3_m'.
                 Doubleword element 0 (the low 8 bytes) of each average is
                 copied out into 'row0_m'..'row3_m', and the four doublewords
                 are handed to SD4 together with 'pdst' and 'stride' to be
                 stored as an 8x4 byte block.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16u8 avg0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                 \
    v16u8 avg1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                 \
    v16u8 avg2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                 \
    v16u8 avg3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                 \
    uint64_t row0_m = __msa_copy_u_d((v2i64) avg0_m, 0);                    \
    uint64_t row1_m = __msa_copy_u_d((v2i64) avg1_m, 0);                    \
    uint64_t row2_m = __msa_copy_u_d((v2i64) avg2_m, 0);                    \
    uint64_t row3_m = __msa_copy_u_d((v2i64) avg3_m, 0);                    \
                                                                            \
    SD4(row0_m, row1_m, row2_m, row3_m, pdst, stride);                      \
}
2556
/* Description : Average byte elements from pair of vectors and store 16x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The unsigned byte elements of each input pair ('in0'/'in1',
                 'in2'/'in3', 'in4'/'in5', 'in6'/'in7') are averaged as
                 (a + b)/2 into 'avg0_m'..'avg3_m'.
                 The four full vectors are handed to ST_UB4 together with
                 'pdst' and 'stride' to be stored as a 16x4 byte block.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 avg0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                  \
    v16u8 avg1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                  \
    v16u8 avg2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                  \
    v16u8 avg3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                  \
                                                                             \
    ST_UB4(avg0_m, avg1_m, avg2_m, avg3_m, pdst, stride);                    \
}
2582
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The four input pairs ('in0'/'in1', 'in2'/'in3', 'in4'/'in5',
                 'in6'/'in7') are passed to AVER_UB4_UB, which produces the
                 rounded averages (a + b + 1)/2 in 'avr0_m'..'avr3_m'.
                 Doubleword element 0 (the low 8 bytes) of each result is
                 copied out into 'lo0_m'..'lo3_m', and the four doublewords
                 are handed to SD4 together with 'pdst' and 'stride' to be
                 stored as an 8x4 byte block.
                 The local names are deliberately unusual: other macros pass
                 their own '_m' temporaries as arguments to this one, and an
                 argument with the same name as a local here would be
                 shadowed.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 avr0_m, avr1_m, avr2_m, avr3_m;                                    \
    uint64_t lo0_m, lo1_m, lo2_m, lo3_m;                                     \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                avr0_m, avr1_m, avr2_m, avr3_m);                             \
                                                                             \
    lo0_m = __msa_copy_u_d((v2i64) avr0_m, 0);                               \
    lo1_m = __msa_copy_u_d((v2i64) avr1_m, 0);                               \
    lo2_m = __msa_copy_u_d((v2i64) avr2_m, 0);                               \
    lo3_m = __msa_copy_u_d((v2i64) avr3_m, 0);                               \
    SD4(lo0_m, lo1_m, lo2_m, lo3_m, pdst, stride);                           \
}
2611
/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The four input pairs ('in0'/'in1', 'in2'/'in3', 'in4'/'in5',
                 'in6'/'in7') are passed to AVER_UB4_UB, which produces the
                 rounded averages (a + b + 1)/2 in 'avr0_m'..'avr3_m'.
                 The four full vectors are handed to ST_UB4 together with
                 'pdst' and 'stride' to be stored as a 16x4 byte block.
                 The local names are deliberately unusual: other macros pass
                 their own '_m' temporaries as arguments to this one, and an
                 argument with the same name as a local here would be
                 shadowed.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 avr0_m, avr1_m, avr2_m, avr3_m;                                     \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                avr0_m, avr1_m, avr2_m, avr3_m);                              \
                                                                              \
    ST_UB4(avr0_m, avr1_m, avr2_m, avr3_m, pdst, stride);                     \
}
2634
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Four vectors 'dst0_m'..'dst3_m' are first loaded from 'pdst'
                 with LD_UB4, using 'stride' between rows.
                 Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                 The pairs 'dstN_m'/'tmpN_m' are then passed to AVER_ST8x4_UB,
                 which performs the second rounded average (with the loaded
                 destination) and stores the half vector results in
                 destination memory as 8x4 byte block.
                 NOTE(review): LD_UB4 presumably loads a full 16-byte vector
                 per row although only 8 bytes per row are written back;
                 confirm that callers tolerate the wider read.
                 The locals declared here are passed as macro arguments to
                 AVER_ST8x4_UB, so their names must stay distinct from the
                 locals declared inside that macro.
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2662
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Four vectors 'dst0_m'..'dst3_m' are first loaded from 'pdst'
                 with LD_UB4, using 'stride' between rows.
                 Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                 The pairs 'dstN_m'/'tmpN_m' are then passed to
                 AVER_ST16x4_UB, which performs the second rounded average
                 (with the loaded destination) and stores the vector results
                 in destination memory as 16x4 byte block.
                 The locals declared here are passed as macro arguments to
                 AVER_ST16x4_UB, so their names must stay distinct from the
                 locals declared inside that macro.
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2690
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are combined pairwise with
                 ILVR_D2_SH into two signed halfword vectors 'inp0_m' and
                 'inp1_m'.
                 Four 32-bit words (one 4-byte row each) are loaded from
                 'pdst' with LW4, inserted two per vector into 'dst0_m' and
                 'dst1_m', and widened to halfwords by interleaving with
                 zero.
                 The widened destination rows and the inputs are added,
                 clipped between 0-255, packed back to bytes and the four
                 resulting words are stored to 'pdst' with SW4.
                 NOTE(review): ILVR_D2_SH is defined outside this section;
                 presumably it takes the right (low) doubleword of each
                 input, i.e. four halfword values per input row - confirm.
                 'pdst' is expanded in both the load and the store, so it
                 should be an expression without side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    /* Terminated with ';' like every other helper invocation,  */\
    /* so this does not depend on the helper's expansion form.  */\
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    /* Widen destination bytes to halfwords, add, clip to 0-255 */\
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    /* Words 0 and 1 of each packed vector are the output rows. */\
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
2720
/* Description : Dot product and accumulation of 3 input vector pairs
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - value of the expression (accumulator 'acc_m')
                 Return Type - signed halfword
   Details     : All six arguments are reinterpreted as signed byte vectors.
                 Signed dot product of 'in0' with 'coeff0' initialises the
                 halfword accumulator
                 Signed dot product of 'in1' with 'coeff1' is added to it
                 Signed dot product of 'in2' with 'coeff2' is added to it

                 acc_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)

                 The macro is a statement expression and evaluates to the
                 accumulator.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 acc_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);      \
                                                                    \
    acc_m = __msa_dpadd_s_h(acc_m, (v16i8) in1, (v16i8) coeff1);    \
    acc_m = __msa_dpadd_s_h(acc_m, (v16i8) in2, (v16i8) coeff2);    \
    acc_m;                                                          \
} )
2742
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - value of the expression (local 'res_m')
                 Return Type - unsigned byte
   Details     : The even byte elements of 'in0' and 'in1' are packed
                 together in one vector and every byte of the result is
                 xor'ed with 128 to shift the range from signed to unsigned
                 byte.
                 The macro is a statement expression and evaluates to the
                 packed, xor'ed vector.
*/
#define PCKEV_XORI128_UB(in0, in1)                                  \
( {                                                                 \
    v16u8 res_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
                                                                    \
    res_m = __msa_xori_b(res_m, 128);                               \
    res_m;                                                          \
} )
2758
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are each reduced to one unsigned
                 byte vector with PCKEV_XORI128_UB.
                 The two vectors are averaged with 'dst0' and 'dst1' through
                 AVER_UB2_UB, and ST_D4 stores doublewords 0 and 1 of each
                 result starting at 'pdst' (taken once, as a byte pointer)
                 with 'stride'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    uint8_t *dst_ptr_m = (uint8_t *) (pdst);                  \
    v16u8 pix0_m = PCKEV_XORI128_UB(in0, in1);                \
    v16u8 pix1_m = PCKEV_XORI128_UB(in2, in3);                \
                                                              \
    AVER_UB2_UB(pix0_m, dst0, pix1_m, dst1, pix0_m, pix1_m);  \
    ST_D4(pix0_m, pix1_m, 0, 1, 0, 1, dst_ptr_m, stride);     \
}
2774
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB packs 'in0'/'in1' into 'pk01_m' and 'in2'/'in3'
                 into 'pk23_m'.
                 Word elements 0 and 2 of each packed vector are copied out
                 into 'w0_m'..'w3_m' and handed to SW4 together with 'pdst'
                 and 'stride'.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    v16i8 pk01_m, pk23_m;                                 \
    uint32_t w0_m, w1_m, w2_m, w3_m;                      \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, pk01_m, pk23_m);      \
                                                          \
    w0_m = __msa_copy_u_w((v4i32) pk01_m, 0);             \
    w1_m = __msa_copy_u_w((v4i32) pk01_m, 2);             \
    w2_m = __msa_copy_u_w((v4i32) pk23_m, 0);             \
    w3_m = __msa_copy_u_w((v4i32) pk23_m, 2);             \
                                                          \
    SW4(w0_m, w1_m, w2_m, w3_m, pdst, stride);            \
}
2794
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs  - in0, in1, pdst
   Details     : The even byte elements of 'in0' and 'in1' are packed into
                 one signed byte vector, which is written to 'pdst' as a
                 whole vector with ST_SB.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                             \
{                                                               \
    v16i8 packed_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);   \
                                                                \
    ST_SB(packed_m, (pdst));                                    \
}
2805
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
                 Outputs - value of the expression (accumulator 'acc_m')
                 Return Type - unsigned halfword
   Details     : Bytes are selected from 'in0' and 'in1' according to 'mask'
                 with a byte shuffle.
                 The unsigned dot product of the selected bytes with 'coeff'
                 gives the halfword accumulator, which is then shifted right
                 with rounding by 'shift' and finally unsigned-saturated.
                 NOTE(review): 'shift' is also passed as the immediate of
                 __msa_sat_u_h, so the saturation width is tied to the shift
                 amount - confirm this is intended for every caller.
                 The macro is a statement expression and evaluates to the
                 filtered vector.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)                  \
( {                                                                       \
    v16i8 sel_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);   \
    v8u16 acc_m = __msa_dotp_u_h((v16u8) sel_m, (v16u8) coeff);           \
                                                                          \
    acc_m = (v8u16) __msa_srari_h((v8i16) acc_m, shift);                  \
    acc_m = __msa_sat_u_h(acc_m, shift);                                  \
    acc_m;                                                                \
} )
2821#endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */
2822