1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22cabdff1aSopenharmony_ci#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci#include <stdint.h>
25cabdff1aSopenharmony_ci#include <msa.h>
26cabdff1aSopenharmony_ci#include <config.h>
27cabdff1aSopenharmony_ci
/* Alignment constant, in bytes. */
#define ALIGNMENT 16

/* Variable attribute: requests an alignment of twice 'align' bytes
   (the argument is shifted left by one before being handed to aligned()). */
#define ALLOC_ALIGNED(align)  \
    __attribute__ ((aligned((align) << 1)))
30cabdff1aSopenharmony_ci
/* Description : Load one element of type RTYPE from memory
   Arguments   : Inputs  - RTYPE (type to load as)
                         - psrc  (source pointer to load from)
                 Return Type - as per RTYPE
   Details     : Dereferences 'psrc' reinterpreted as a pointer to RTYPE.
                 The whole expansion is parenthesized so that a postfix
                 operator written after the macro (subscript, member access)
                 applies to the loaded value and not to the pointer cast.
*/
#define LD_V(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
38cabdff1aSopenharmony_ci
/* Description : Store one element of type RTYPE to memory
   Arguments   : Inputs - RTYPE (type to store as)
                        - in    (value to store)
                        - pdst  (destination pointer to store to)
   Details     : Assigns 'in' through 'pdst' reinterpreted as a pointer
                 to RTYPE
*/
#define ST_V(RTYPE, in, pdst)  \
    *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci#if (__mips_isa_rev >= 6)
48cabdff1aSopenharmony_ci    #define LH(psrc)                              \
49cabdff1aSopenharmony_ci    ( {                                           \
50cabdff1aSopenharmony_ci        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
51cabdff1aSopenharmony_ci        val_lh_m;                                 \
52cabdff1aSopenharmony_ci    } )
53cabdff1aSopenharmony_ci
54cabdff1aSopenharmony_ci    #define LW(psrc)                              \
55cabdff1aSopenharmony_ci    ( {                                           \
56cabdff1aSopenharmony_ci        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
57cabdff1aSopenharmony_ci        val_lw_m;                                 \
58cabdff1aSopenharmony_ci    } )
59cabdff1aSopenharmony_ci
60cabdff1aSopenharmony_ci    #if (__mips == 64)
61cabdff1aSopenharmony_ci        #define LD(psrc)                               \
62cabdff1aSopenharmony_ci        ( {                                            \
63cabdff1aSopenharmony_ci            uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
64cabdff1aSopenharmony_ci            val_ld_m;                                  \
65cabdff1aSopenharmony_ci        } )
66cabdff1aSopenharmony_ci    #else  // !(__mips == 64)
67cabdff1aSopenharmony_ci        #define LD(psrc)                                                    \
68cabdff1aSopenharmony_ci        ( {                                                                 \
69cabdff1aSopenharmony_ci            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
70cabdff1aSopenharmony_ci            uint32_t val0_ld_m, val1_ld_m;                                  \
71cabdff1aSopenharmony_ci            uint64_t val_ld_m = 0;                                          \
72cabdff1aSopenharmony_ci                                                                            \
73cabdff1aSopenharmony_ci            val0_ld_m = LW(psrc_ld_m);                                      \
74cabdff1aSopenharmony_ci            val1_ld_m = LW(psrc_ld_m + 4);                                  \
75cabdff1aSopenharmony_ci                                                                            \
76cabdff1aSopenharmony_ci            val_ld_m = (uint64_t) (val1_ld_m);                              \
77cabdff1aSopenharmony_ci            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
78cabdff1aSopenharmony_ci            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
79cabdff1aSopenharmony_ci                                                                            \
80cabdff1aSopenharmony_ci            val_ld_m;                                                       \
81cabdff1aSopenharmony_ci        } )
82cabdff1aSopenharmony_ci    #endif  // (__mips == 64)
83cabdff1aSopenharmony_ci
84cabdff1aSopenharmony_ci    #define SH(val, pdst)  *(uint16_t *)(pdst) = (val);
85cabdff1aSopenharmony_ci    #define SW(val, pdst)  *(uint32_t *)(pdst) = (val);
86cabdff1aSopenharmony_ci    #define SD(val, pdst)  *(uint64_t *)(pdst) = (val);
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci#else  // !(__mips_isa_rev >= 6)
89cabdff1aSopenharmony_ci    #define LH(psrc)                                 \
90cabdff1aSopenharmony_ci    ( {                                              \
91cabdff1aSopenharmony_ci        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
92cabdff1aSopenharmony_ci        uint16_t val_lh_m;                           \
93cabdff1aSopenharmony_ci                                                     \
94cabdff1aSopenharmony_ci        __asm__ volatile (                           \
95cabdff1aSopenharmony_ci            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
96cabdff1aSopenharmony_ci                                                     \
97cabdff1aSopenharmony_ci            : [val_lh_m] "=r" (val_lh_m)             \
98cabdff1aSopenharmony_ci            : [psrc_lh_m] "m" (*psrc_lh_m)           \
99cabdff1aSopenharmony_ci        );                                           \
100cabdff1aSopenharmony_ci                                                     \
101cabdff1aSopenharmony_ci        val_lh_m;                                    \
102cabdff1aSopenharmony_ci    } )
103cabdff1aSopenharmony_ci
104cabdff1aSopenharmony_ci    #define LW(psrc)                                 \
105cabdff1aSopenharmony_ci    ( {                                              \
106cabdff1aSopenharmony_ci        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
107cabdff1aSopenharmony_ci        uint32_t val_lw_m;                           \
108cabdff1aSopenharmony_ci                                                     \
109cabdff1aSopenharmony_ci        __asm__ volatile (                           \
110cabdff1aSopenharmony_ci            "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"  \
111cabdff1aSopenharmony_ci            "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"  \
112cabdff1aSopenharmony_ci                                                     \
113cabdff1aSopenharmony_ci            : [val_lw_m] "=&r"(val_lw_m)             \
114cabdff1aSopenharmony_ci            : [psrc_lw_m] "r"(psrc_lw_m)             \
115cabdff1aSopenharmony_ci        );                                           \
116cabdff1aSopenharmony_ci                                                     \
117cabdff1aSopenharmony_ci        val_lw_m;                                    \
118cabdff1aSopenharmony_ci    } )
119cabdff1aSopenharmony_ci
120cabdff1aSopenharmony_ci    #if (__mips == 64)
121cabdff1aSopenharmony_ci        #define LD(psrc)                                 \
122cabdff1aSopenharmony_ci        ( {                                              \
123cabdff1aSopenharmony_ci            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
124cabdff1aSopenharmony_ci            uint64_t val_ld_m = 0;                       \
125cabdff1aSopenharmony_ci                                                         \
126cabdff1aSopenharmony_ci            __asm__ volatile (                           \
127cabdff1aSopenharmony_ci                "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"  \
128cabdff1aSopenharmony_ci                "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"  \
129cabdff1aSopenharmony_ci                                                         \
130cabdff1aSopenharmony_ci                : [val_ld_m] "=&r" (val_ld_m)            \
131cabdff1aSopenharmony_ci                : [psrc_ld_m] "r" (psrc_ld_m)            \
132cabdff1aSopenharmony_ci            );                                           \
133cabdff1aSopenharmony_ci                                                         \
134cabdff1aSopenharmony_ci            val_ld_m;                                    \
135cabdff1aSopenharmony_ci        } )
136cabdff1aSopenharmony_ci    #else  // !(__mips == 64)
137cabdff1aSopenharmony_ci        #define LD(psrc)                                                    \
138cabdff1aSopenharmony_ci        ( {                                                                 \
139cabdff1aSopenharmony_ci            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
140cabdff1aSopenharmony_ci            uint32_t val0_ld_m, val1_ld_m;                                  \
141cabdff1aSopenharmony_ci            uint64_t val_ld_m = 0;                                          \
142cabdff1aSopenharmony_ci                                                                            \
143cabdff1aSopenharmony_ci            val0_ld_m = LW(psrc_ld_m);                                      \
144cabdff1aSopenharmony_ci            val1_ld_m = LW(psrc_ld_m + 4);                                  \
145cabdff1aSopenharmony_ci                                                                            \
146cabdff1aSopenharmony_ci            val_ld_m = (uint64_t) (val1_ld_m);                              \
147cabdff1aSopenharmony_ci            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
148cabdff1aSopenharmony_ci            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
149cabdff1aSopenharmony_ci                                                                            \
150cabdff1aSopenharmony_ci            val_ld_m;                                                       \
151cabdff1aSopenharmony_ci        } )
152cabdff1aSopenharmony_ci    #endif  // (__mips == 64)
153cabdff1aSopenharmony_ci
154cabdff1aSopenharmony_ci    #define SH(val, pdst)                            \
155cabdff1aSopenharmony_ci    {                                                \
156cabdff1aSopenharmony_ci        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
157cabdff1aSopenharmony_ci        uint16_t val_sh_m = (val);                   \
158cabdff1aSopenharmony_ci                                                     \
159cabdff1aSopenharmony_ci        __asm__ volatile (                           \
160cabdff1aSopenharmony_ci            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
161cabdff1aSopenharmony_ci                                                     \
162cabdff1aSopenharmony_ci            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
163cabdff1aSopenharmony_ci            : [val_sh_m] "r" (val_sh_m)              \
164cabdff1aSopenharmony_ci        );                                           \
165cabdff1aSopenharmony_ci    }
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci    #define SW(val, pdst)                            \
168cabdff1aSopenharmony_ci    {                                                \
169cabdff1aSopenharmony_ci        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
170cabdff1aSopenharmony_ci        uint32_t val_sw_m = (val);                   \
171cabdff1aSopenharmony_ci                                                     \
172cabdff1aSopenharmony_ci        __asm__ volatile (                           \
173cabdff1aSopenharmony_ci            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
174cabdff1aSopenharmony_ci                                                     \
175cabdff1aSopenharmony_ci            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
176cabdff1aSopenharmony_ci            : [val_sw_m] "r" (val_sw_m)              \
177cabdff1aSopenharmony_ci        );                                           \
178cabdff1aSopenharmony_ci    }
179cabdff1aSopenharmony_ci
180cabdff1aSopenharmony_ci    #define SD(val, pdst)                                             \
181cabdff1aSopenharmony_ci    {                                                                 \
182cabdff1aSopenharmony_ci        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
183cabdff1aSopenharmony_ci        uint32_t val0_sd_m, val1_sd_m;                                \
184cabdff1aSopenharmony_ci                                                                      \
185cabdff1aSopenharmony_ci        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
186cabdff1aSopenharmony_ci        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
187cabdff1aSopenharmony_ci                                                                      \
188cabdff1aSopenharmony_ci        SW(val0_sd_m, pdst_sd_m);                                     \
189cabdff1aSopenharmony_ci        SW(val1_sd_m, pdst_sd_m + 4);                                 \
190cabdff1aSopenharmony_ci    }
191cabdff1aSopenharmony_ci#endif // (__mips_isa_rev >= 6)
192cabdff1aSopenharmony_ci
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic)
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument such as 'a + b' is scaled as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + (stride));                  \
    out2 = LW((psrc) + 2 * (stride));              \
    out3 = LW((psrc) + 3 * (stride));              \
}
209cabdff1aSopenharmony_ci
/* Description : Load 2 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic)
                 Outputs - out0, out1
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion, so any
                 expression may be passed for it.
*/
#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + (stride));      \
}
215cabdff1aSopenharmony_ci
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic)
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion, so any
                 expression may be passed for it.
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + (stride));      \
}
/* Description : Load 4 double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic)
                 Outputs - out0, out1, out2, out3
   Details     : Two LD2() invocations: the first loads from (psrc) and
                 (psrc + stride), the second from (psrc + 2 * stride) and
                 (psrc + 3 * stride).
                 'stride' is parenthesized both where it is scaled and where
                 it is forwarded, so an expression argument such as 'a + b'
                 is treated as a whole.
*/
#define LD4(psrc, stride, out0, out1, out2, out3)      \
{                                                      \
    LD2((psrc), (stride), out0, out1);                 \
    LD2((psrc) + 2 * (stride), (stride), out2, out3);  \
}
233cabdff1aSopenharmony_ci
/* Description : Store 4 words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument such as 'a + b' is scaled as a whole. Every SW()
                 invocation is terminated with ';'.
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + (stride));                \
    SW(in2, (pdst) + 2 * (stride));            \
    SW(in3, (pdst) + 3 * (stride));            \
}
248cabdff1aSopenharmony_ci
/* Description : Store 4 double words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument such as 'a + b' is scaled as a whole. Every SD()
                 invocation is terminated with ';'.
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + (stride));                \
    SD(in2, (pdst) + 2 * (stride));            \
    SD(in3, (pdst) + 3 * (stride));            \
}
263cabdff1aSopenharmony_ci
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride  (added to 'psrc' by pointer arithmetic)
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion, so any
                 expression may be passed for it.
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + (stride));      \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
282cabdff1aSopenharmony_ci
/* Description : Load 3 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0' and 'out1' are loaded by LD_V2() from (psrc) and
                 (psrc + stride); 'out2' is loaded from (psrc + 2 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), (stride), out0, out1);       \
    out2 = LD_V(RTYPE, (psrc) + 2 * (stride));        \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)
290cabdff1aSopenharmony_ci
/* Description : Load 4 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Two LD_V2() invocations, starting at (psrc) and at
                 (psrc + 2 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)      \
{                                                               \
    LD_V2(RTYPE, (psrc), (stride), out0, out1);                 \
    LD_V2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)
301cabdff1aSopenharmony_ci
/* Description : Load 5 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4
                 Return Type - as per RTYPE
   Details     : 'out0'..'out3' are loaded by LD_V4() starting at (psrc);
                 'out4' is loaded from (psrc + 4 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);       \
    out4 = LD_V(RTYPE, (psrc) + 4 * (stride));                    \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
309cabdff1aSopenharmony_ci
/* Description : Load 6 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4, out5
                 Return Type - as per RTYPE
   Details     : 'out0'..'out3' are loaded by LD_V4() starting at (psrc);
                 'out4' and 'out5' by LD_V2() starting at (psrc + 4 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);             \
    LD_V2(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5);          \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)
319cabdff1aSopenharmony_ci
/* Description : Load 7 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4, out5, out6
                 Return Type - as per RTYPE
   Details     : 'out0'..'out4' are loaded by LD_V5() starting at (psrc);
                 'out5' and 'out6' by LD_V2() starting at (psrc + 5 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V7(RTYPE, psrc, stride,                                 \
              out0, out1, out2, out3, out4, out5, out6)            \
{                                                                  \
    LD_V5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6);     \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
328cabdff1aSopenharmony_ci
/* Description : Load 8 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out7
                 Return Type - as per RTYPE
   Details     : Two LD_V4() invocations, starting at (psrc) and at
                 (psrc + 4 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V8(RTYPE, psrc, stride,                                          \
              out0, out1, out2, out3, out4, out5, out6, out7)               \
{                                                                           \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                 \
    LD_V4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)
340cabdff1aSopenharmony_ci
/* Description : Load 16 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out15
                 Return Type - as per RTYPE
   Details     : Two LD_V8() invocations, starting at (psrc) and at
                 (psrc + 8 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), (stride),                                    \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * (stride), (stride),                     \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
351cabdff1aSopenharmony_ci
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion, so any
                 expression may be passed for it.
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + (stride));      \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
368cabdff1aSopenharmony_ci
/* Description : Store 4 vectors with stride
   Arguments   : Inputs  - in0, in1, in2, in3, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Two ST_V2() invocations, starting at (pdst) and at
                 (pdst + 2 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)      \
{                                                           \
    ST_V2(RTYPE, in0, in1, (pdst), (stride));               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)
378cabdff1aSopenharmony_ci
/* Description : Store 6 vectors with stride
   Arguments   : Inputs  - in0 .. in5, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : 'in0'..'in3' are stored by ST_V4() starting at (pdst);
                 'in4' and 'in5' by ST_V2() starting at (pdst + 4 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride));           \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * (stride), (stride));      \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)
385cabdff1aSopenharmony_ci
/* Description : Store 8 vectors with stride
   Arguments   : Inputs  - in0 .. in7, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Two ST_V4() invocations, starting at (pdst) and at
                 (pdst + 4 * stride).
                 'stride' is parenthesized wherever it is scaled or
                 forwarded, so an expression argument is treated as a whole.
*/
#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                     \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));      \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
394cabdff1aSopenharmony_ci
/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 *
 * ST_H1 stores the single half word 'idx' of 'in' to (pdst) through SH().
 * 'in' is parenthesized before the v8i16 cast so that the cast applies to
 * the whole argument expression.
 */
#define ST_H1(in, idx, pdst)                             \
{                                                        \
    uint16_t out0_m;                                     \
    out0_m = __msa_copy_u_h((v8i16) (in), idx);          \
    SH(out0_m, (pdst));                                  \
}
/* Description : Store 2 half word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - idx0, idx1  (element indices within 'in')
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               'in' and 'stride' are parenthesized in the expansion, so
 *               expression arguments are treated as a whole.
 */
#define ST_H2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint16_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_h((v8i16) (in), idx0);         \
    out1_m = __msa_copy_u_h((v8i16) (in), idx1);         \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + (stride));                       \
}
/* Description : Store 4 half word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - idx0 .. idx3  (element indices within 'in')
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores half word 'idxN' from 'in' to (pdst + N * stride)
 *               for N = 0..3.
 *               'in' and 'stride' are parenthesized in the expansion, so
 *               expression arguments such as 'a + b' are treated as a whole.
 */
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) (in), idx0);         \
    out1_m = __msa_copy_u_h((v8i16) (in), idx1);         \
    out2_m = __msa_copy_u_h((v8i16) (in), idx2);         \
    out3_m = __msa_copy_u_h((v8i16) (in), idx3);         \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + (stride));                       \
    SH(out2_m, (pdst) + 2 * (stride));                   \
    SH(out3_m, (pdst) + 3 * (stride));                   \
}
/* Description : Store 8 half word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - idx0 .. idx7  (element indices within 'in')
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Two ST_H4() invocations: 'idx0'..'idx3' starting at (pdst)
 *               and 'idx4'..'idx7' starting at (pdst + 4 * stride).
 *               'in', 'pdst' and 'stride' are parenthesized where they are
 *               scaled or forwarded, and each nested invocation is
 *               terminated with ';'.
 */
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,                       \
              idx6, idx7, pdst, stride)                                     \
{                                                                           \
    ST_H4((in), idx0, idx1, idx2, idx3, (pdst), (stride));                  \
    ST_H4((in), idx4, idx5, idx6, idx7, (pdst) + 4 * (stride), (stride));   \
}
435cabdff1aSopenharmony_ci
436cabdff1aSopenharmony_ci/* Description : Store word elements of vector with stride
437cabdff1aSopenharmony_ci * Arguments   : Inputs  - in   source vector
438cabdff1aSopenharmony_ci *                       - pdst    (destination pointer to store to)
439cabdff1aSopenharmony_ci *                       - stride
440cabdff1aSopenharmony_ci * Details     : Stores word 'idx0' from 'in' to (pdst)
441cabdff1aSopenharmony_ci *               Stores word 'idx1' from 'in' to (pdst + stride)
442cabdff1aSopenharmony_ci *               Similar for other elements
443cabdff1aSopenharmony_ci */
/* ST_W1: write one word element of 'in' to 'pdst'.
 * Element 'idx' is fetched with __msa_copy_u_w and handed to SW.
 * 'in' is parenthesized before the cast so that an expression argument
 * is converted as a whole.
 */
#define ST_W1(in, idx, pdst)                             \
{                                                        \
    uint32_t out0_m;                                     \
                                                         \
    out0_m = __msa_copy_u_w((v4i32) (in), (idx));        \
    SW(out0_m, (pdst));                                  \
}
/* ST_W2: write two word elements of 'in' to two rows.
 * Element 'idx0' goes to (pdst), element 'idx1' to (pdst) + (stride);
 * the offset is ordinary pointer arithmetic on 'pdst'.
 * Both elements are read before the first store. 'in', 'pdst' and
 * 'stride' are parenthesized in the expansion; 'in' and 'pdst' are
 * expanded more than once and should be free of side effects.
 */
#define ST_W2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint32_t out0_m, out1_m;                             \
                                                         \
    out0_m = __msa_copy_u_w((v4i32) (in), (idx0));       \
    out1_m = __msa_copy_u_w((v4i32) (in), (idx1));       \
                                                         \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + (stride));                       \
}
/* ST_W4: write four word elements of 'in' to four rows.
 * Element 'idxN' is fetched with __msa_copy_u_w and handed to SW together
 * with the address (pdst) + N * (stride), for N = 0..3; the offset is
 * ordinary pointer arithmetic on 'pdst'.
 * All elements are read before the first store. Arguments are
 * parenthesized in the expansion; 'in', 'pdst' and 'stride' are expanded
 * several times and should be free of side effects.
 */
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
                                                         \
    out0_m = __msa_copy_u_w((v4i32) (in), (idx0));       \
    out1_m = __msa_copy_u_w((v4i32) (in), (idx1));       \
    out2_m = __msa_copy_u_w((v4i32) (in), (idx2));       \
    out3_m = __msa_copy_u_w((v4i32) (in), (idx3));       \
                                                         \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + (stride));                       \
    SW(out2_m, (pdst) + 2 * (stride));                   \
    SW(out3_m, (pdst) + 3 * (stride));                   \
}
/* ST_W8: write four word elements of 'in0' and four of 'in1' to eight rows.
 * Expands to two ST_W4 invocations: idx0..idx3 of 'in0' go to the four
 * rows that start at (pdst), idx4..idx7 of 'in1' to the four rows that
 * start at (pdst) + 4 * (stride).
 * Arguments are forwarded parenthesized so that expression arguments
 * keep their meaning.
 */
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                 \
              idx4, idx5, idx6, idx7, pdst, stride)             \
{                                                               \
    ST_W4((in0), (idx0), (idx1), (idx2), (idx3),                \
          (pdst), (stride))                                     \
    ST_W4((in1), (idx4), (idx5), (idx6), (idx7),                \
          (pdst) + 4 * (stride), (stride))                      \
}
476cabdff1aSopenharmony_ci
477cabdff1aSopenharmony_ci/* Description : Store double word elements of vector with stride
478cabdff1aSopenharmony_ci * Arguments   : Inputs  - in   source vector
479cabdff1aSopenharmony_ci *                       - pdst    (destination pointer to store to)
480cabdff1aSopenharmony_ci *                       - stride
481cabdff1aSopenharmony_ci * Details     : Stores double word 'idx0' from 'in' to (pdst)
482cabdff1aSopenharmony_ci *               Stores double word 'idx1' from 'in' to (pdst + stride)
483cabdff1aSopenharmony_ci *               Similar for other elements
484cabdff1aSopenharmony_ci */
/* ST_D1: write one double word element of 'in' to 'pdst'.
 * Element 'idx' is fetched with __msa_copy_u_d and handed to SD.
 * 'in' is parenthesized before the cast so that an expression argument
 * is converted as a whole.
 */
#define ST_D1(in, idx, pdst)                        \
{                                                   \
    uint64_t out0_m;                                \
                                                    \
    out0_m = __msa_copy_u_d((v2i64) (in), (idx));   \
    SD(out0_m, (pdst));                             \
}
/* ST_D2: write two double word elements of 'in' to two rows.
 * Element 'idx0' goes to (pdst), element 'idx1' to (pdst) + (stride);
 * the offset is ordinary pointer arithmetic on 'pdst'.
 * Both elements are read before the first store. 'in', 'pdst' and
 * 'stride' are parenthesized in the expansion; 'in' and 'pdst' are
 * expanded more than once and should be free of side effects.
 */
#define ST_D2(in, idx0, idx1, pdst, stride)         \
{                                                   \
    uint64_t out0_m, out1_m;                        \
                                                    \
    out0_m = __msa_copy_u_d((v2i64) (in), (idx0));  \
    out1_m = __msa_copy_u_d((v2i64) (in), (idx1));  \
                                                    \
    SD(out0_m, (pdst));                             \
    SD(out1_m, (pdst) + (stride));                  \
}
/* ST_D4: write two double word elements of 'in0' and two of 'in1' to
 * four rows.
 * Elements 'idx0' and 'idx1' are taken from 'in0' (rows 0 and 1),
 * elements 'idx2' and 'idx3' from 'in1' (rows 2 and 3). Row N is stored
 * with SD at (pdst) + N * (stride), using ordinary pointer arithmetic on
 * 'pdst'. All elements are read before the first store.
 * Arguments are parenthesized in the expansion; 'in0', 'in1', 'pdst' and
 * 'stride' are expanded several times and should be free of side effects.
 */
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) (in0), (idx0));           \
    out1_m = __msa_copy_u_d((v2i64) (in0), (idx1));           \
    out2_m = __msa_copy_u_d((v2i64) (in1), (idx2));           \
    out3_m = __msa_copy_u_d((v2i64) (in1), (idx3));           \
                                                              \
    SD(out0_m, (pdst));                                       \
    SD(out1_m, (pdst) + (stride));                            \
    SD(out2_m, (pdst) + 2 * (stride));                        \
    SD(out3_m, (pdst) + 3 * (stride));                        \
}
/* ST_D8: write double word elements of four vectors to eight rows.
 * Expands to two ST_D4 invocations: 'in0'/'in1' with idx0..idx3 go to the
 * four rows that start at (pdst), 'in2'/'in3' with idx4..idx7 to the four
 * rows that start at (pdst) + 4 * (stride).
 * Arguments are forwarded parenthesized so that expression arguments
 * keep their meaning.
 */
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,              \
              idx4, idx5, idx6, idx7, pdst, stride)                    \
{                                                                      \
    ST_D4((in0), (in1), (idx0), (idx1), (idx2), (idx3),                \
          (pdst), (stride))                                            \
    ST_D4((in2), (in3), (idx4), (idx5), (idx6), (idx7),                \
          (pdst) + 4 * (stride), (stride))                             \
}
517cabdff1aSopenharmony_ci
518cabdff1aSopenharmony_ci/* Description : Store as 12x8 byte block to destination memory from
519cabdff1aSopenharmony_ci                 input vectors
520cabdff1aSopenharmony_ci   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
521cabdff1aSopenharmony_ci   Details     : Index 0 double word element from input vector 'in0' is copied
522cabdff1aSopenharmony_ci                 and stored to destination memory at (pblk_12x8_m) followed by
523cabdff1aSopenharmony_ci                 index 2 word element from same input vector 'in0' at
524cabdff1aSopenharmony_ci                 (pblk_12x8_m + 8)
525cabdff1aSopenharmony_ci                 Similar to remaining lines
526cabdff1aSopenharmony_ci*/
/* ST12x8_UB: store the first 12 bytes of each of eight vectors as eight
 * rows.
 * For every input vector, double word element 0 is stored with SD at the
 * start of the row and word element 2 is stored with SW eight bytes
 * further on. 'pdst' is converted to 'uint8_t *' once, so 'stride' is
 * counted in bytes. All sixteen elements are read before the first store.
 * The vector arguments and 'stride' are parenthesized in the expansion so
 * that expression arguments are converted and added as a whole.
 */
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    /* bytes 0..7 of every row */                                        \
    out0_m = __msa_copy_u_d((v2i64) (in0), 0);                           \
    out1_m = __msa_copy_u_d((v2i64) (in1), 0);                           \
    out2_m = __msa_copy_u_d((v2i64) (in2), 0);                           \
    out3_m = __msa_copy_u_d((v2i64) (in3), 0);                           \
    out4_m = __msa_copy_u_d((v2i64) (in4), 0);                           \
    out5_m = __msa_copy_u_d((v2i64) (in5), 0);                           \
    out6_m = __msa_copy_u_d((v2i64) (in6), 0);                           \
    out7_m = __msa_copy_u_d((v2i64) (in7), 0);                           \
                                                                         \
    /* bytes 8..11 of every row */                                       \
    out8_m  = __msa_copy_u_w((v4i32) (in0), 2);                          \
    out9_m  = __msa_copy_u_w((v4i32) (in1), 2);                          \
    out10_m = __msa_copy_u_w((v4i32) (in2), 2);                          \
    out11_m = __msa_copy_u_w((v4i32) (in3), 2);                          \
    out12_m = __msa_copy_u_w((v4i32) (in4), 2);                          \
    out13_m = __msa_copy_u_w((v4i32) (in5), 2);                          \
    out14_m = __msa_copy_u_w((v4i32) (in6), 2);                          \
    out15_m = __msa_copy_u_w((v4i32) (in7), 2);                          \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += (stride);                                             \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += (stride);                                             \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += (stride);                                             \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_ci/* Description : average with rounding (in0 + in1 + 1) / 2.
579cabdff1aSopenharmony_ci   Arguments   : Inputs  - in0, in1, in2, in3,
580cabdff1aSopenharmony_ci                 Outputs - out0, out1
581cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
582cabdff1aSopenharmony_ci   Details     : Each byte element from 'in0' vector is added with each byte
583cabdff1aSopenharmony_ci                 element from 'in1' vector. The addition of the elements plus 1
584cabdff1aSopenharmony_ci                (for rounding) is done unsigned with full precision,
585cabdff1aSopenharmony_ci                i.e. the result has one extra bit. Unsigned division by 2
586cabdff1aSopenharmony_ci                (or logical shift right by one bit) is performed before writing
587cabdff1aSopenharmony_ci                the result to vector 'out0'
588cabdff1aSopenharmony_ci                Similar for the pair of 'in2' and 'in3'
589cabdff1aSopenharmony_ci*/
/* AVER_UB2: two unsigned byte averages with rounding.
 * out0 = (RTYPE) __msa_aver_u_b(in0, in1)
 * out1 = (RTYPE) __msa_aver_u_b(in2, in3)
 * Every input is parenthesized before it is cast to v16u8 so that an
 * expression argument is converted as a whole.
 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) (in0), (v16u8) (in1));  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) (in2), (v16u8) (in3));  \
}
/* AVER_UB2 with v16u8 results. */
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
596cabdff1aSopenharmony_ci
/* AVER_UB4: four rounding byte averages in one invocation.
 * Expands to two AVER_UB2 invocations, in this order:
 *   (in0, in1) -> out0 and (in2, in3) -> out1,
 *   (in4, in5) -> out2 and (in6, in7) -> out3.
 */
#define AVER_UB4(RTYPE,                                  \
                 in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                 \
{                                                        \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)      \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)      \
}
/* AVER_UB4 with v16u8 results. */
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
604cabdff1aSopenharmony_ci
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - d, s, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from 'd' vector are slid into 's' by the
                 number of elements specified by 'slide_val'
*/
/* SLDI_B: one immediate byte slide.
 * out = (RTYPE) __msa_sldi_b(d, s, slide_val), with 'd' and 's' cast to
 * v16i8. Both vectors are parenthesized before the cast so that an
 * expression argument is converted as a whole.
 */
#define SLDI_B(RTYPE, d, s, slide_val, out)                             \
{                                                                       \
    out = (RTYPE) __msa_sldi_b((v16i8) (d), (v16i8) (s), (slide_val));  \
}
616cabdff1aSopenharmony_ci
/* SLDI_B2: two immediate byte slides sharing one 'slide_val'.
 * Expands to SLDI_B for (d0, s0) -> out0, then for (d1, s1) -> out1.
 */
#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
{                                                             \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                    \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                    \
}
/* SLDI_B2 with a fixed result type. */
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)
626cabdff1aSopenharmony_ci
/* SLDI_B3: three immediate byte slides sharing one 'slide_val'.
 * Pairs are processed in order: (d0, s0) -> out0, (d1, s1) -> out1,
 * (d2, s2) -> out2, each through SLDI_B.
 */
#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,  \
                out0, out1, out2)                          \
{                                                          \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                 \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                 \
    SLDI_B(RTYPE, d2, s2, slide_val, out2)                 \
}
/* SLDI_B3 with a fixed result type. */
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
636cabdff1aSopenharmony_ci
/* SLDI_B4: four immediate byte slides sharing one 'slide_val'.
 * Pairs are processed in order: (d0, s0) -> out0, (d1, s1) -> out1,
 * (d2, s2) -> out2, (d3, s3) -> out3, each through SLDI_B.
 */
#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,     \
                slide_val, out0, out1, out2, out3)         \
{                                                          \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                 \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                 \
    SLDI_B(RTYPE, d2, s2, slide_val, out2)                 \
    SLDI_B(RTYPE, d3, s3, slide_val, out3)                 \
}
/* SLDI_B4 with a fixed result type. */
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
646cabdff1aSopenharmony_ci
647cabdff1aSopenharmony_ci/* Description : Shuffle byte vector elements as per mask vector
648cabdff1aSopenharmony_ci   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
649cabdff1aSopenharmony_ci                 Outputs - out0, out1
650cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
651cabdff1aSopenharmony_ci   Details     : Selective byte elements from in0 & in1 are copied to out0 as
652cabdff1aSopenharmony_ci                 per control vector mask0
653cabdff1aSopenharmony_ci                 Selective byte elements from in2 & in3 are copied to out1 as
654cabdff1aSopenharmony_ci                 per control vector mask1
655cabdff1aSopenharmony_ci*/
/* VSHF_B2: two byte shuffles.
 * out0 = (RTYPE) __msa_vshf_b(mask0, in1, in0)
 * out1 = (RTYPE) __msa_vshf_b(mask1, in3, in2)
 * Note the operand order handed to the intrinsic: the mask first, then
 * the second vector of the pair, then the first.
 * Every argument is parenthesized before it is cast to v16i8 so that an
 * expression argument is converted as a whole.
 */
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
{                                                                     \
    out0 = (RTYPE) __msa_vshf_b((v16i8) (mask0),                      \
                                (v16i8) (in1), (v16i8) (in0));        \
    out1 = (RTYPE) __msa_vshf_b((v16i8) (mask1),                      \
                                (v16i8) (in3), (v16i8) (in2));        \
}
/* VSHF_B2 with a fixed result type. */
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
665cabdff1aSopenharmony_ci
/* VSHF_B3: three byte shuffles.
 * The first two pairs are handled by VSHF_B2:
 *   (in0, in1, mask0) -> out0 and (in2, in3, mask1) -> out1.
 * The third is done directly:
 *   out2 = (RTYPE) __msa_vshf_b(mask2, in5, in4)
 * Arguments are parenthesized, both in the direct cast and when they are
 * forwarded, so that expression arguments are converted as a whole.
 */
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, (in0), (in1), (in2), (in3), (mask0), (mask1),           \
            out0, out1);                                                   \
    out2 = (RTYPE) __msa_vshf_b((v16i8) (mask2),                           \
                                (v16i8) (in5), (v16i8) (in4));             \
}
/* VSHF_B3 with v16i8 results. */
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
673cabdff1aSopenharmony_ci
/* VSHF_B4: shuffle one vector pair (in0, in1) with four different masks.
 * Expands to two VSHF_B2 invocations that both use (in0, in1):
 *   mask0 -> out0 and mask1 -> out1, then mask2 -> out2 and mask3 -> out3.
 */
#define VSHF_B4(RTYPE, in0, in1,                                   \
                mask0, mask1, mask2, mask3,                        \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
/* VSHF_B4 with a fixed result type. */
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
682cabdff1aSopenharmony_ci
683cabdff1aSopenharmony_ci/* Description : Shuffle halfword vector elements as per mask vector
684cabdff1aSopenharmony_ci   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
685cabdff1aSopenharmony_ci                 Outputs - out0, out1
686cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
687cabdff1aSopenharmony_ci   Details     : Selective halfword elements from in0 & in1 are copied to out0
688cabdff1aSopenharmony_ci                 as per control vector mask0
689cabdff1aSopenharmony_ci                 Selective halfword elements from in2 & in3 are copied to out1
690cabdff1aSopenharmony_ci                 as per control vector mask1
691cabdff1aSopenharmony_ci*/
/* VSHF_H2: two halfword shuffles.
 * out0 = (RTYPE) __msa_vshf_h(mask0, in1, in0)
 * out1 = (RTYPE) __msa_vshf_h(mask1, in3, in2)
 * Note the operand order handed to the intrinsic: the mask first, then
 * the second vector of the pair, then the first.
 * Every argument is parenthesized before it is cast to v8i16 so that an
 * expression argument is converted as a whole.
 */
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
{                                                                     \
    out0 = (RTYPE) __msa_vshf_h((v8i16) (mask0),                      \
                                (v8i16) (in1), (v8i16) (in0));        \
    out1 = (RTYPE) __msa_vshf_h((v8i16) (mask1),                      \
                                (v8i16) (in3), (v8i16) (in2));        \
}
/* VSHF_H2 with v8i16 results. */
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
698cabdff1aSopenharmony_ci
/* VSHF_H3: three halfword shuffles.
 * The first two pairs are handled by VSHF_H2:
 *   (in0, in1, mask0) -> out0 and (in2, in3, mask1) -> out1.
 * The third is done directly:
 *   out2 = (RTYPE) __msa_vshf_h(mask2, in5, in4)
 * Arguments are parenthesized, both in the direct cast and when they are
 * forwarded, so that expression arguments are converted as a whole.
 */
#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, (in0), (in1), (in2), (in3), (mask0), (mask1),           \
            out0, out1);                                                   \
    out2 = (RTYPE) __msa_vshf_h((v8i16) (mask2),                           \
                                (v8i16) (in5), (v8i16) (in4));             \
}
/* VSHF_H3 with v8i16 results. */
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
706cabdff1aSopenharmony_ci
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
/* VSHF_W2: two word shuffles.
 * out0 = (RTYPE) __msa_vshf_w(mask0, in1, in0)
 * out1 = (RTYPE) __msa_vshf_w(mask1, in3, in2)
 * Note the operand order handed to the intrinsic: the mask first, then
 * the second vector of the pair, then the first.
 * Every argument is parenthesized before it is cast to v4i32 so that an
 * expression argument is converted as a whole.
 */
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
{                                                                     \
    out0 = (RTYPE) __msa_vshf_w((v4i32) (mask0),                      \
                                (v4i32) (in1), (v4i32) (in0));        \
    out1 = (RTYPE) __msa_vshf_w((v4i32) (mask1),                      \
                                (v4i32) (in3), (v4i32) (in2));        \
}
/* VSHF_W2 with v16i8 results. */
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
722cabdff1aSopenharmony_ci
723cabdff1aSopenharmony_ci/* Description : Dot product of byte vector elements
724cabdff1aSopenharmony_ci   Arguments   : Inputs  - mult0, mult1
725cabdff1aSopenharmony_ci                           cnst0, cnst1
726cabdff1aSopenharmony_ci                 Outputs - out0, out1
727cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
728cabdff1aSopenharmony_ci   Details     : Unsigned byte elements from mult0 are multiplied with
729cabdff1aSopenharmony_ci                 unsigned byte elements from cnst0 producing a result
730cabdff1aSopenharmony_ci                 twice the size of input i.e. unsigned halfword.
731cabdff1aSopenharmony_ci                 Then this multiplication results of adjacent odd-even elements
732cabdff1aSopenharmony_ci                 are added together and stored to the out vector
733cabdff1aSopenharmony_ci                 (2 unsigned halfword results)
734cabdff1aSopenharmony_ci*/
/* DOTP_UB2: two unsigned byte dot products.
 * out0 = (RTYPE) __msa_dotp_u_h(mult0, cnst0)
 * out1 = (RTYPE) __msa_dotp_u_h(mult1, cnst1)
 * Every input is parenthesized before it is cast to v16u8 so that an
 * expression argument is converted as a whole.
 */
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                     \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) (mult0), (v16u8) (cnst0));  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) (mult1), (v16u8) (cnst1));  \
}
/* DOTP_UB2 with v8u16 results. */
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
741cabdff1aSopenharmony_ci
/* DOTP_UB4: four unsigned byte dot products.
 * Expands to two DOTP_UB2 invocations, in this order:
 *   (mult0, cnst0) -> out0 and (mult1, cnst1) -> out1,
 *   (mult2, cnst2) -> out2 and (mult3, cnst3) -> out3.
 */
#define DOTP_UB4(RTYPE,                                       \
                 mult0, mult1, mult2, mult3,                  \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
/* DOTP_UB4 with v8u16 results. */
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
750cabdff1aSopenharmony_ci
751cabdff1aSopenharmony_ci/* Description : Dot product of byte vector elements
752cabdff1aSopenharmony_ci   Arguments   : Inputs  - mult0, mult1
753cabdff1aSopenharmony_ci                           cnst0, cnst1
754cabdff1aSopenharmony_ci                 Outputs - out0, out1
755cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
756cabdff1aSopenharmony_ci   Details     : Signed byte elements from mult0 are multiplied with
757cabdff1aSopenharmony_ci                 signed byte elements from cnst0 producing a result
758cabdff1aSopenharmony_ci                 twice the size of input i.e. signed halfword.
759cabdff1aSopenharmony_ci                 Then this multiplication results of adjacent odd-even elements
760cabdff1aSopenharmony_ci                 are added together and stored to the out vector
761cabdff1aSopenharmony_ci                 (2 signed halfword results)
762cabdff1aSopenharmony_ci*/
/* DOTP_SB2: two signed byte dot products.
 * out0 = (RTYPE) __msa_dotp_s_h(mult0, cnst0)
 * out1 = (RTYPE) __msa_dotp_s_h(mult1, cnst1)
 * Every input is parenthesized before it is cast to v16i8 so that an
 * expression argument is converted as a whole.
 */
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                     \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) (mult0), (v16i8) (cnst0));  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) (mult1), (v16i8) (cnst1));  \
}
/* DOTP_SB2 with v8i16 results. */
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
769cabdff1aSopenharmony_ci
/* DOTP_SB3: three signed byte dot products.
 * The first two are handled by DOTP_SB2:
 *   (mult0, cnst0) -> out0 and (mult1, cnst1) -> out1.
 * The third is done directly:
 *   out2 = (RTYPE) __msa_dotp_s_h(mult2, cnst2)
 * Arguments are parenthesized, both in the direct cast and when they are
 * forwarded, so that expression arguments are converted as a whole.
 */
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,      \
                 out0, out1, out2)                                     \
{                                                                      \
    DOTP_SB2(RTYPE, (mult0), (mult1), (cnst0), (cnst1), out0, out1);   \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) (mult2), (v16i8) (cnst2));   \
}
/* DOTP_SB3 with v8i16 results. */
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
777cabdff1aSopenharmony_ci
/* DOTP_SB4: four signed byte dot products.
 * Expands to two DOTP_SB2 invocations, in this order:
 *   (mult0, cnst0) -> out0 and (mult1, cnst1) -> out1,
 *   (mult2, cnst2) -> out2 and (mult3, cnst3) -> out3.
 */
#define DOTP_SB4(RTYPE,                                       \
                 mult0, mult1, mult2, mult3,                  \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
/* DOTP_SB4 with v8i16 results. */
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci/* Description : Dot product of halfword vector elements
787cabdff1aSopenharmony_ci   Arguments   : Inputs  - mult0, mult1
788cabdff1aSopenharmony_ci                           cnst0, cnst1
789cabdff1aSopenharmony_ci                 Outputs - out0, out1
790cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
791cabdff1aSopenharmony_ci   Details     : Signed halfword elements from mult0 are multiplied with
792cabdff1aSopenharmony_ci                 signed halfword elements from cnst0 producing a result
793cabdff1aSopenharmony_ci                 twice the size of input i.e. signed word.
794cabdff1aSopenharmony_ci                 Then this multiplication results of adjacent odd-even elements
795cabdff1aSopenharmony_ci                 are added together and stored to the out vector
796cabdff1aSopenharmony_ci                 (2 signed word results)
797cabdff1aSopenharmony_ci*/
/* DOTP_SH2: two signed halfword dot products.
 * out0 = (RTYPE) __msa_dotp_s_w(mult0, cnst0)
 * out1 = (RTYPE) __msa_dotp_s_w(mult1, cnst1)
 * Every input is parenthesized before it is cast to v8i16 so that an
 * expression argument is converted as a whole.
 */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                     \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) (mult0), (v8i16) (cnst0));  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) (mult1), (v8i16) (cnst1));  \
}
/* DOTP_SH2 with v4i32 results. */
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
804cabdff1aSopenharmony_ci
/* DOTP_SH4: four signed halfword dot products.
 * Expands to two DOTP_SH2 invocations, in this order:
 *   (mult0, cnst0) -> out0 and (mult1, cnst1) -> out1,
 *   (mult2, cnst2) -> out2 and (mult3, cnst3) -> out3.
 */
#define DOTP_SH4(RTYPE,                                       \
                 mult0, mult1, mult2, mult3,                  \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
/* DOTP_SH4 with v4i32 results. */
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
813cabdff1aSopenharmony_ci
814cabdff1aSopenharmony_ci/* Description : Dot product & addition of byte vector elements
815cabdff1aSopenharmony_ci   Arguments   : Inputs  - mult0, mult1
816cabdff1aSopenharmony_ci                           cnst0, cnst1
817cabdff1aSopenharmony_ci                 Outputs - out0, out1
818cabdff1aSopenharmony_ci                 Return Type - as per RTYPE
819cabdff1aSopenharmony_ci   Details     : Signed byte elements from mult0 are multiplied with
820cabdff1aSopenharmony_ci                 signed byte elements from cnst0 producing a result
821cabdff1aSopenharmony_ci                 twice the size of input i.e. signed halfword.
822cabdff1aSopenharmony_ci                 Then this multiplication results of adjacent odd-even elements
823cabdff1aSopenharmony_ci                 are added to the out vector
824cabdff1aSopenharmony_ci                 (2 signed halfword results)
825cabdff1aSopenharmony_ci*/
/* DPADD_SB2: two signed byte dot products accumulated into the outputs.
 * out0 = (RTYPE) __msa_dpadd_s_h(out0, mult0, cnst0)
 * out1 = (RTYPE) __msa_dpadd_s_h(out1, mult1, cnst1)
 * 'out0' and 'out1' are read as the accumulator (cast to v8i16) and then
 * overwritten, so they must hold a value before the macro is used.
 * Every operand is parenthesized before its cast so that an expression
 * argument is converted as a whole.
 */
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) (out0),                     \
                                   (v16i8) (mult0), (v16i8) (cnst0));  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) (out1),                     \
                                   (v16i8) (mult1), (v16i8) (cnst1));  \
}
/* DPADD_SB2 with v8i16 results. */
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
834cabdff1aSopenharmony_ci
/* DPADD_SB4: four signed byte dot products accumulated into the outputs.
 * Expands to two DPADD_SB2 invocations, in this order:
 *   (mult0, cnst0) -> out0 and (mult1, cnst1) -> out1,
 *   (mult2, cnst2) -> out2 and (mult3, cnst3) -> out3.
 */
#define DPADD_SB4(RTYPE,                                       \
                  mult0, mult1, mult2, mult3,                  \
                  cnst0, cnst1, cnst2, cnst3,                  \
                  out0, out1, out2, out3)                      \
{                                                              \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
/* DPADD_SB4 with v8i16 results. */
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
842cabdff1aSopenharmony_ci
/* Description : Unsigned byte dot product accumulated into halfword vectors
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read as accumulators, then overwritten)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements of 'mult0' are multiplied with the
                 unsigned byte elements of 'cnst0', giving unsigned halfword
                 products. The products of adjacent odd-even element pairs
                 are summed and added to 'out0'.
                 'mult1', 'cnst1' and 'out1' are handled the same way.
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0, (v16u8) mult0,    \
                                   (v16u8) cnst0);                 \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1, (v16u8) mult1,    \
                                   (v16u8) cnst1);                 \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
863cabdff1aSopenharmony_ci
/* Description : Signed halfword dot product accumulated into word vectors
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read as accumulators, then overwritten)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements of 'mult0' are multiplied with the
                 signed halfword elements of 'cnst0', giving signed word
                 products. The products of adjacent odd-even element pairs
                 are summed and added to 'out0'.
                 'mult1', 'cnst1' and 'out1' are handled the same way.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, (v8i16) mult0,    \
                                   (v8i16) cnst0);                 \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, (v8i16) mult1,    \
                                   (v8i16) cnst1);                 \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
884cabdff1aSopenharmony_ci
/* Four-accumulator form of DPADD_SH2: every (multN, cnstN) pair is
   accumulated into outN with __msa_dpadd_s_w, in the order 0, 1, 2, 3 */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                       \
                                   (v8i16) mult0, (v8i16) cnst0);      \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                       \
                                   (v8i16) mult1, (v8i16) cnst1);      \
    out2 = (RTYPE) __msa_dpadd_s_w((v4i32) out2,                       \
                                   (v8i16) mult2, (v8i16) cnst2);      \
    out3 = (RTYPE) __msa_dpadd_s_w((v4i32) out3,                       \
                                   (v8i16) mult3, (v8i16) cnst3);      \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
892cabdff1aSopenharmony_ci
/* Description : Element-wise unsigned halfword minimum against a common vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element of 'in0' is compared with the
                 corresponding element of 'min_vec' and the smaller of the two
                 is written back to 'in0'. 'in1' is processed the same way.
                 'min_vec' is cast to v8u16 like the other operands, so any
                 128-bit vector type may be passed.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)                           \
{                                                                   \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, (v8u16) (min_vec));    \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, (v8u16) (min_vec));    \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
907cabdff1aSopenharmony_ci
/* Four-vector form of MIN_UH2: in0..in3 are each replaced, in place, by
   their unsigned halfword minimum with 'min_vec' */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)     \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
    in2 = (RTYPE) __msa_min_u_h((v8u16) in2, min_vec);  \
    in3 = (RTYPE) __msa_min_u_h((v8u16) in3, min_vec);  \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
914cabdff1aSopenharmony_ci
/* Description : Clips all signed halfword elements of input vector between
                 min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (input vector, must be an lvalue of type v8i16)
                         - min   (min threshold vector)
                         - max   (max threshold vector)
                 Outputs - in    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : 'min' and 'max' are parenthesized before being cast so that
                 expression arguments (e.g. 'a - b') are cast as a whole.
*/
#define CLIP_SH(in, min, max)                         \
{                                                     \
    /* lower bound: element-wise signed max with min */ \
    in = __msa_max_s_h((v8i16) (min), (v8i16) (in));  \
    /* upper bound: element-wise signed min with max */ \
    in = __msa_min_s_h((v8i16) (max), (v8i16) (in));  \
}
928cabdff1aSopenharmony_ci
/* Description : Clamps every signed halfword element of a vector to the
                 range 0 .. 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (same vector, clipped in place)
                 Return Type - signed halfwords
*/
#define CLIP_SH_0_255(in)                        \
{                                                \
    /* negative elements become 0 */             \
    in = __msa_maxi_s_h((v8i16) in, 0);          \
    /* unsigned saturation caps the rest */      \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);   \
}
940cabdff1aSopenharmony_ci
/* Two-vector form of CLIP_SH_0_255: in0 and in1 are clipped in place */
#define CLIP_SH2_0_255(in0, in1)                  \
{                                                 \
    {                                             \
        in0 = __msa_maxi_s_h((v8i16) in0, 0);     \
        in0 = (v8i16) __msa_sat_u_h((v8u16) in0, 7); \
    }                                             \
    {                                             \
        in1 = __msa_maxi_s_h((v8i16) in1, 0);     \
        in1 = (v8i16) __msa_sat_u_h((v8u16) in1, 7); \
    }                                             \
}
946cabdff1aSopenharmony_ci
/* Four-vector form of CLIP_SH_0_255: in0..in3 are clipped in place, in order */
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH_0_255(in0);                     \
    CLIP_SH_0_255(in1);                     \
    CLIP_SH_0_255(in2);                     \
    CLIP_SH_0_255(in3);                     \
}
952cabdff1aSopenharmony_ci
/* Eight-vector form of CLIP_SH_0_255: in0..in7 are clipped in place, in order */
#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
    CLIP_SH2_0_255(in4, in5);               \
    CLIP_SH2_0_255(in6, in7);               \
}
959cabdff1aSopenharmony_ci
/* Description : Clamps every signed word element of a vector to the
                 range 0 .. 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (same vector, clipped in place)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                        \
{                                                \
    /* negative elements become 0 */             \
    in = __msa_maxi_s_w((v4i32) in, 0);          \
    /* unsigned saturation caps the rest */      \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);   \
}
971cabdff1aSopenharmony_ci
/* Two-vector form of CLIP_SW_0_255: in0 and in1 are clipped in place */
#define CLIP_SW2_0_255(in0, in1)                  \
{                                                 \
    {                                             \
        in0 = __msa_maxi_s_w((v4i32) in0, 0);     \
        in0 = (v4i32) __msa_sat_u_w((v4u32) in0, 7); \
    }                                             \
    {                                             \
        in1 = __msa_maxi_s_w((v4i32) in1, 0);     \
        in1 = (v4i32) __msa_sat_u_w((v4u32) in1, 7); \
    }                                             \
}
977cabdff1aSopenharmony_ci
/* Four-vector form of CLIP_SW_0_255: in0..in3 are clipped in place, in order */
#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW_0_255(in0);                     \
    CLIP_SW_0_255(in1);                     \
    CLIP_SW_0_255(in2);                     \
    CLIP_SW_0_255(in3);                     \
}
983cabdff1aSopenharmony_ci
/* Eight-vector form of CLIP_SW_0_255: in0..in7 are clipped in place, in order */
#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
    CLIP_SW2_0_255(in4, in5);               \
    CLIP_SW2_0_255(in6, in7);               \
}
990cabdff1aSopenharmony_ci
/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (signed word vector, evaluated once)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : Adjacent word pairs are first summed into two doublewords,
                 the two doublewords are added, and word element 0 of the
                 result is returned as the 32-bit sum.
*/
#define HADD_SW_S32(in)                                           \
( {                                                               \
    /* capture the argument so it is not expanded twice */        \
    v4i32 hadd_sw_in_m = (v4i32) (in);                            \
    v2i64 res0_m, res1_m;                                         \
    int32_t sum_m;                                                \
                                                                  \
    res0_m = __msa_hadd_s_d(hadd_sw_in_m, hadd_sw_in_m);          \
    /* broadcast doubleword 1 so it can be added to doubleword 0 */ \
    res1_m = __msa_splati_d(res0_m, 1);                           \
    res0_m += res1_m;                                             \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);                    \
    sum_m;                                                        \
} )
1009cabdff1aSopenharmony_ci
/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector, evaluated once)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : Adjacent halfword pairs are summed into words, adjacent word
                 pairs into doublewords, the two doublewords are added, and
                 word element 0 of the result is returned as the 32-bit sum.
*/
#define HADD_UH_U32(in)                                           \
( {                                                               \
    /* capture the argument so it is not expanded twice */        \
    v8u16 hadd_uh_in_m = (v8u16) (in);                            \
    v4u32 res_m;                                                  \
    v2u64 res0_m, res1_m;                                         \
    uint32_t sum_m;                                               \
                                                                  \
    res_m = __msa_hadd_u_w(hadd_uh_in_m, hadd_uh_in_m);           \
    res0_m = __msa_hadd_u_d(res_m, res_m);                        \
    /* broadcast doubleword 1 so it can be added to doubleword 0 */ \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);           \
    res0_m += res1_m;                                             \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);                    \
    sum_m;                                                        \
} )
1030cabdff1aSopenharmony_ci
/* Description : Pairwise (horizontal) addition of signed byte elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Within 'in0', each odd signed byte element is added to the
                 adjacent even signed byte element; the halfword sums are
                 stored in 'out0'. 'in1' produces 'out1' the same way.
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)  \
{                                              \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, \
                                  (v16i8) in0); \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, \
                                  (v16i8) in1); \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)
1045cabdff1aSopenharmony_ci
/* Four-vector form of HADD_SB2: outN receives the pairwise signed byte
   sums of inN, for N = 0..3 */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);         \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);         \
    out2 = (RTYPE) __msa_hadd_s_h((v16i8) in2, (v16i8) in2);         \
    out3 = (RTYPE) __msa_hadd_s_h((v16i8) in3, (v16i8) in3);         \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1053cabdff1aSopenharmony_ci
/* Description : Pairwise (horizontal) addition of unsigned byte elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Within 'in0', each odd unsigned byte element is added to the
                 adjacent even unsigned byte element; the halfword sums are
                 stored in 'out0'. 'in1' produces 'out1' the same way.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)  \
{                                              \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, \
                                  (v16u8) in0); \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, \
                                  (v16u8) in1); \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
1068cabdff1aSopenharmony_ci
/* Three-vector form of HADD_UB2: outN receives the pairwise unsigned byte
   sums of inN, for N = 0..2 */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)
1075cabdff1aSopenharmony_ci
/* Four-vector form of HADD_UB2: outN receives the pairwise unsigned byte
   sums of inN, for N = 0..3 */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);         \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);         \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);         \
    out3 = (RTYPE) __msa_hadd_u_h((v16u8) in3, (v16u8) in3);         \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1084cabdff1aSopenharmony_ci
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even unsigned byte element of 'in0' is subtracted from
                 the adjacent odd unsigned byte element of 'in0' (pairwise,
                 i.e. odd - even) and the halfword result is stored in 'out0'.
                 'in1' produces 'out1' the same way.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) (in0), (v16u8) (in0));  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) (in1), (v16u8) (in1));  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
1100cabdff1aSopenharmony_ci
/* Four-vector form of HSUB_UB2: outN receives the pairwise unsigned byte
   differences of inN as computed by __msa_hsub_u_h, for N = 0..3 */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);         \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);         \
    out2 = (RTYPE) __msa_hsub_u_h((v16u8) in2, (v16u8) in2);         \
    out3 = (RTYPE) __msa_hsub_u_h((v16u8) in3, (v16u8) in3);         \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1108cabdff1aSopenharmony_ci
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute differences of the byte elements of 'in0' and
                 'ref0' are kept in 'diff0_m', those of 'in1' and 'ref1' in
                 'diff1_m'. For each of the two, adjacent even-odd byte pairs
                 are added into 8 halfword sums, and the two sets of sums are
                 added together to form the result.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                             \
( {                                                                  \
    v16u8 diff0_m, diff1_m;                                          \
    v8u16 sad_m;                                                     \
                                                                     \
    /* per-byte |in - ref| */                                        \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);             \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);             \
                                                                     \
    /* pairwise widen-and-add both difference vectors, then combine */ \
    sad_m = __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m) +       \
            __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);        \
                                                                     \
    sad_m;                                                           \
} )
1131cabdff1aSopenharmony_ci
/* Description : Insert word values into the first 2 word elements of 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (word values passed to __msa_insert_w)
                 Outputs - out      (output vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to word element 0 and 'in1' to word
                 element 1 of 'out'; the macro does not write elements 2
                 and 3.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    /* element 0 <- in0 */                              \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    /* element 1 <- in1 */                              \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
1145cabdff1aSopenharmony_ci
/* Four-word form of INSERT_W2: in0..in3 are written to word elements
   0..3 of 'out', in that order, replacing every element of the vector */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    /* element 0 <- in0 */                              \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    /* element 1 <- in1 */                              \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    /* element 2 <- in2 */                              \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    /* element 3 <- in3 */                              \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1157cabdff1aSopenharmony_ci
/* Description : Insert double word values into the 2 double word elements
                 of 1 destination vector
   Arguments   : Inputs  - in0, in1 (double word values passed to
                                     __msa_insert_d)
                 Outputs - out      (output vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to double word element 0 and 'in1' to
                 double word element 1 of 'out'.
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    /* element 0 <- in0 */                              \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    /* element 1 <- in1 */                              \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1173cabdff1aSopenharmony_ci
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even byte elements of 'in0' and of 'in1' are
                 interleaved and written to 'out0'; the even byte elements
                 of 'in2' and of 'in3' are interleaved and written to 'out1'.
                 Note that each pair is handed to the intrinsic in reverse
                 order (second vector first).
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                        \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1,            \
                                 (v16i8) in0);           \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3,            \
                                 (v16i8) in2);           \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1192cabdff1aSopenharmony_ci
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even halfword elements of 'in0' and of 'in1' are
                 interleaved and written to 'out0'; the even halfword
                 elements of 'in2' and of 'in3' are interleaved and written
                 to 'out1'.
                 Note that each pair is handed to the intrinsic in reverse
                 order (second vector first).
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                        \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1,            \
                                 (v8i16) in0);           \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3,            \
                                 (v8i16) in2);           \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1210cabdff1aSopenharmony_ci
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even word elements of 'in0' and of 'in1' are
                 interleaved and written to 'out0'; the even word elements
                 of 'in2' and of 'in3' are interleaved and written to 'out1'.
                 Note that each pair is handed to the intrinsic in reverse
                 order (second vector first).
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                        \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1,            \
                                 (v4i32) in0);           \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3,            \
                                 (v4i32) in2);           \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1229cabdff1aSopenharmony_ci
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even double word elements of 'in0' and of 'in1' are
                 interleaved and written to 'out0'; the even double word
                 elements of 'in2' and of 'in3' are interleaved and written
                 to 'out1'.
                 Note that each pair is handed to the intrinsic in reverse
                 order (second vector first).
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                        \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1,            \
                                 (v2i64) in0);           \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3,            \
                                 (v2i64) in2);           \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1247cabdff1aSopenharmony_ci
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The left-half byte elements of 'in0' and of 'in1' are
                 interleaved and written to 'out0'; the left-half byte
                 elements of 'in2' and of 'in3' are interleaved and written
                 to 'out1'. Each pair is handed to the intrinsic in the
                 order given (first vector first).
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                       \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0,            \
                                (v16i8) in1);           \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2,            \
                                (v16i8) in3);           \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1266cabdff1aSopenharmony_ci
/* Four-output form of ILVL_B2: the input pairs (in0, in1), (in2, in3),
   (in4, in5) and (in6, in7) produce out0, out1, out2 and out3 */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);      \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);      \
    out2 = (RTYPE) __msa_ilvl_b((v16i8) in4, (v16i8) in5);      \
    out3 = (RTYPE) __msa_ilvl_b((v16i8) in6, (v16i8) in7);      \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1277cabdff1aSopenharmony_ci
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The left-half halfword elements of 'in0' and of 'in1' are
                 interleaved and written to 'out0'; the left-half halfword
                 elements of 'in2' and of 'in3' are interleaved and written
                 to 'out1'. Each pair is handed to the intrinsic in the
                 order given (first vector first).
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                       \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0,            \
                                (v8i16) in1);           \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2,            \
                                (v8i16) in3);           \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1294cabdff1aSopenharmony_ci
/* Description : Interleave left half of halfword elements from four
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two ILVL_H2 invocations: the first handles
                 (in0, in1) -> out0 and (in2, in3) -> out1, the second
                 handles (in4, in5) -> out2 and (in6, in7) -> out3.
*/
#define ILVL_H4(RTYPE,                                   \
                in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                  \
{                                                        \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);      \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);      \
}
#define ILVL_H4_SH(...)  ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...)  ILVL_H4(v4i32, __VA_ARGS__)
1303cabdff1aSopenharmony_ci
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and left half of word
                 elements of 'in1' are interleaved and copied to 'out0'.
                 Left half of word elements of 'in2' and left half of word
                 elements of 'in3' are interleaved and copied to 'out1'.
                 Each input is parenthesized before it is cast to v4i32, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1321cabdff1aSopenharmony_ci
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and right half of byte
                 elements of 'in1' are interleaved and copied to 'out0'.
                 Right half of byte elements of 'in2' and right half of byte
                 elements of 'in3' are interleaved and copied to 'out1'.
                 The ILVR_B3, ILVR_B4 and ILVR_B8 forms do the same for
                 further input pairs.
                 Each input is parenthesized before it is cast to v16i8, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1342cabdff1aSopenharmony_ci
/* Description : Interleave right half of byte elements from three
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : (in0, in1) -> out0 and (in2, in3) -> out1 are produced by
                 ILVR_B2; (in4, in5) -> out2 is produced here with
                 __msa_ilvr_b.
                 'in4' and 'in5' are parenthesized before the v16i8 cast so
                 an expression argument is converted as a whole.
*/
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) (in4), (v16i8) (in5));          \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1352cabdff1aSopenharmony_ci
/* Description : Interleave right half of byte elements from four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two ILVR_B2 invocations: the first handles
                 (in0, in1) -> out0 and (in2, in3) -> out1, the second
                 handles (in4, in5) -> out2 and (in6, in7) -> out3.
*/
#define ILVR_B4(RTYPE,                                   \
                in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                  \
{                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);      \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);      \
}
#define ILVR_B4_UB(...)  ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...)  ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...)  ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...)  ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...)  ILVR_B4(v4i32, __VA_ARGS__)
1364cabdff1aSopenharmony_ci
/* Description : Interleave right half of byte elements from eight
                 vector pairs
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ... out7
                 Return Type - as per RTYPE
   Details     : Expands to four ILVR_B2 invocations, taken in order:
                 input pair (in[2k], in[2k+1]) produces out[k].
*/
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                in8, in9, in10, in11, in12, in13, in14, in15,    \
                out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);              \
    ILVR_B2(RTYPE, in8, in9, in10, in11, out4, out5);            \
    ILVR_B2(RTYPE, in12, in13, in14, in15, out6, out7);          \
}
#define ILVR_B8_UH(...)  ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...)  ILVR_B8(v4i32, __VA_ARGS__)
1376cabdff1aSopenharmony_ci
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and right half of
                 halfword elements of 'in1' are interleaved and copied to
                 'out0'.
                 Right half of halfword elements of 'in2' and right half of
                 halfword elements of 'in3' are interleaved and copied to
                 'out1'.
                 The ILVR_H3 and ILVR_H4 forms do the same for further
                 input pairs.
                 Each input is parenthesized before it is cast to v8i16, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1394cabdff1aSopenharmony_ci
/* Description : Interleave right half of halfword elements from three
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : (in0, in1) -> out0 and (in2, in3) -> out1 are produced by
                 ILVR_H2; (in4, in5) -> out2 is produced here with
                 __msa_ilvr_h.
                 'in4' and 'in5' are parenthesized before the v8i16 cast so
                 an expression argument is converted as a whole.
*/
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5));          \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1401cabdff1aSopenharmony_ci
/* Description : Interleave right half of halfword elements from four
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two ILVR_H2 invocations: the first handles
                 (in0, in1) -> out0 and (in2, in3) -> out1, the second
                 handles (in4, in5) -> out2 and (in6, in7) -> out3.
*/
#define ILVR_H4(RTYPE,                                   \
                in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                  \
{                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);      \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);      \
}
#define ILVR_H4_SH(...)  ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...)  ILVR_H4(v4i32, __VA_ARGS__)
1410cabdff1aSopenharmony_ci
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w of 'in0' and 'in1';
                 'out1' receives __msa_ilvr_w of 'in2' and 'in3'.
                 Each input is parenthesized before it is cast to v4i32, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1419cabdff1aSopenharmony_ci
/* Description : Interleave right half of word elements from four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two ILVR_W2 invocations: the first handles
                 (in0, in1) -> out0 and (in2, in3) -> out1, the second
                 handles (in4, in5) -> out2 and (in6, in7) -> out3.
*/
#define ILVR_W4(RTYPE,                                   \
                in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                  \
{                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);      \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);      \
}
#define ILVR_W4_SB(...)  ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...)  ILVR_W4(v16u8, __VA_ARGS__)
1428cabdff1aSopenharmony_ci
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and right half
                 of double word elements of 'in1' are interleaved and copied
                 to 'out0'.
                 Right half of double word elements of 'in2' and right half
                 of double word elements of 'in3' are interleaved and copied
                 to 'out1'.
                 The ILVR_D3 and ILVR_D4 forms do the same for further
                 input pairs.
                 Each input is parenthesized before it is cast to v2i64, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1446cabdff1aSopenharmony_ci
/* Description : Interleave right half of double word elements from three
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : (in0, in1) -> out0 and (in2, in3) -> out1 are produced by
                 ILVR_D2; (in4, in5) -> out2 is produced here with
                 __msa_ilvr_d.
                 'in4' and 'in5' are parenthesized before the v2i64 cast so
                 an expression argument is converted as a whole.
*/
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1453cabdff1aSopenharmony_ci
/* Description : Interleave right half of double word elements from four
                 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two ILVR_D2 invocations: the first handles
                 (in0, in1) -> out0 and (in2, in3) -> out1, the second
                 handles (in4, in5) -> out2 and (in6, in7) -> out3.
*/
#define ILVR_D4(RTYPE,                                   \
                in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                  \
{                                                        \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);      \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);      \
}
#define ILVR_D4_SB(...)  ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...)  ILVR_D4(v16u8, __VA_ARGS__)
1462cabdff1aSopenharmony_ci
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of 'in0' and left half of
                 double word elements of 'in1' are interleaved and copied to
                 'out0'.
                 Left half of double word elements of 'in2' and left half of
                 double word elements of 'in3' are interleaved and copied to
                 'out1'.
                 Each input is parenthesized before it is cast to v2i64, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1480cabdff1aSopenharmony_ci
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 Each input is parenthesized before it is cast to v16i8, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in0' and 'in1' are read again
                 for 'out1'; if 'out0' names the same variable as an input,
                 'out1' is computed from the updated value.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1500cabdff1aSopenharmony_ci
/* Description : Interleave both left and right half of halfword elements
                 of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_h of 'in0' and 'in1' (right
                 half), 'out1' receives __msa_ilvl_h of 'in0' and 'in1'
                 (left half).
                 Each input is parenthesized before it is cast to v8i16, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in0' and 'in1' are read again
                 for 'out1'; if 'out0' names the same variable as an input,
                 'out1' is computed from the updated value.
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1510cabdff1aSopenharmony_ci
/* Description : Interleave both left and right half of word elements
                 of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w of 'in0' and 'in1' (right
                 half), 'out1' receives __msa_ilvl_w of 'in0' and 'in1'
                 (left half).
                 Each input is parenthesized before it is cast to v4i32, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in0' and 'in1' are read again
                 for 'out1'; if 'out0' names the same variable as an input,
                 'out1' is computed from the updated value.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1519cabdff1aSopenharmony_ci
/* Description : Maximum values between signed halfword elements of vector
                 and 5-bit signed immediate value are written back to the
                 vector
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'.
                 The same is done for 'in1'.
                 The MAXI_SH4 and MAXI_SH8 forms cover further vectors.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                 \
{                                                          \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), max_val);  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), max_val);  \
}
#define MAXI_SH2_UH(...)  MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...)  MAXI_SH2(v8i16, __VA_ARGS__)
1535cabdff1aSopenharmony_ci
/* Description : In-place signed halfword maximum against an immediate for
                 four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Expands to MAXI_SH2 on (in0, in1) followed by MAXI_SH2 on
                 (in2, in3), both with the same 'max_val'.
*/
#define MAXI_SH4(RTYPE, in0, in1,        \
                 in2, in3, max_val)      \
{                                        \
    MAXI_SH2(RTYPE, in0, in1, max_val);  \
    MAXI_SH2(RTYPE, in2, in3, max_val);  \
}
#define MAXI_SH4_UH(...)  MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...)  MAXI_SH4(v8i16, __VA_ARGS__)
1543cabdff1aSopenharmony_ci
/* Description : In-place signed halfword maximum against an immediate for
                 eight vectors
   Arguments   : Inputs  - in0 ... in7, max_val
                 Outputs - in0 ... in7 (in place)
                 Return Type - as per RTYPE
   Details     : Expands to four MAXI_SH2 invocations, taken in order on
                 (in0, in1), (in2, in3), (in4, in5) and (in6, in7), all
                 with the same 'max_val'.
*/
#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH2(RTYPE, in0, in1, max_val);                                   \
    MAXI_SH2(RTYPE, in2, in3, max_val);                                   \
    MAXI_SH2(RTYPE, in4, in5, max_val);                                   \
    MAXI_SH2(RTYPE, in6, in7, max_val);                                   \
}
#define MAXI_SH8_UH(...)  MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...)  MAXI_SH8(v8i16, __VA_ARGS__)
1551cabdff1aSopenharmony_ci
/* Description : Saturate the unsigned halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to
                 the value generated with (sat_val+1) bit range.
                 The same is done for 'in1'.
                 Results are placed back in the original vectors.
                 The SAT_UH4 and SAT_UH8 forms cover further vectors.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                 \
{                                                         \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) (in0), sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) (in1), sat_val);  \
}
#define SAT_UH2_UH(...)  SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...)  SAT_UH2(v8i16, __VA_ARGS__)
1569cabdff1aSopenharmony_ci
/* Description : In-place unsigned halfword saturation for four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Expands to SAT_UH2 on (in0, in1) followed by SAT_UH2 on
                 (in2, in3), both with the same 'sat_val'.
*/
#define SAT_UH4(RTYPE, in0, in1,        \
                in2, in3, sat_val)      \
{                                       \
    SAT_UH2(RTYPE, in0, in1, sat_val);  \
    SAT_UH2(RTYPE, in2, in3, sat_val);  \
}
#define SAT_UH4_UH(...)  SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...)  SAT_UH4(v8i16, __VA_ARGS__)
1577cabdff1aSopenharmony_ci
/* Description : In-place unsigned halfword saturation for eight vectors
   Arguments   : Inputs  - in0 ... in7, sat_val
                 Outputs - in0 ... in7 (in place)
                 Return Type - as per RTYPE
   Details     : Expands to four SAT_UH2 invocations, taken in order on
                 (in0, in1), (in2, in3), (in4, in5) and (in6, in7), all
                 with the same 'sat_val'.
*/
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH2(RTYPE, in0, in1, sat_val);                                   \
    SAT_UH2(RTYPE, in2, in3, sat_val);                                   \
    SAT_UH2(RTYPE, in4, in5, sat_val);                                   \
    SAT_UH2(RTYPE, in6, in7, sat_val);                                   \
}
#define SAT_UH8_UH(...)  SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...)  SAT_UH8(v8i16, __VA_ARGS__)
1585cabdff1aSopenharmony_ci
/* Description : Saturate the signed halfword element values to the signed
                 range of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated with
                 __msa_sat_s_h to the (sat_val+1) bit range.
                 The same is done for 'in1'.
                 Results are placed back in the original vectors.
                 The SAT_SH3 and SAT_SH4 forms cover further vectors.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                 \
{                                                         \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) (in0), sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) (in1), sat_val);  \
}
#define SAT_SH2_SH(...)  SAT_SH2(v8i16, __VA_ARGS__)
1602cabdff1aSopenharmony_ci
/* Description : In-place signed halfword saturation for three vectors
   Arguments   : Inputs  - in0, in1, in2, sat_val
                 Outputs - in0, in1, in2 (in place)
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are handled by SAT_SH2; 'in2' is saturated
                 here with __msa_sat_s_h using the same 'sat_val'.
*/
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)            \
{                                                         \
    SAT_SH2(RTYPE, in0, in1, sat_val);                    \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) (in2), sat_val);  \
}
#define SAT_SH3_SH(...)  SAT_SH3(v8i16, __VA_ARGS__)
1609cabdff1aSopenharmony_ci
/* Description : In-place signed halfword saturation for four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Expands to SAT_SH2 on (in0, in1) followed by SAT_SH2 on
                 (in2, in3), both with the same 'sat_val'.
*/
#define SAT_SH4(RTYPE, in0, in1,        \
                in2, in3, sat_val)      \
{                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);  \
    SAT_SH2(RTYPE, in2, in3, sat_val);  \
}
#define SAT_SH4_SH(...)  SAT_SH4(v8i16, __VA_ARGS__)
1616cabdff1aSopenharmony_ci
/* Description : Saturate the signed word element values to the signed
                 range of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated with
                 __msa_sat_s_w to the (sat_val+1) bit range.
                 The same is done for 'in1'.
                 Results are placed back in the original vectors.
                 The SAT_SW4 form covers two further vectors.
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)                 \
{                                                         \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) (in0), sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) (in1), sat_val);  \
}
#define SAT_SW2_SW(...)  SAT_SW2(v4i32, __VA_ARGS__)
1633cabdff1aSopenharmony_ci
/* Description : In-place signed word saturation for four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Expands to SAT_SW2 on (in0, in1) followed by SAT_SW2 on
                 (in2, in3), both with the same 'sat_val'.
*/
#define SAT_SW4(RTYPE, in0, in1,        \
                in2, in3, sat_val)      \
{                                       \
    SAT_SW2(RTYPE, in0, in1, sat_val);  \
    SAT_SW2(RTYPE, in2, in3, sat_val);  \
}
#define SAT_SW4_SW(...)  SAT_SW4(v4i32, __VA_ARGS__)
1640cabdff1aSopenharmony_ci
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'idx1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for halfword operation is 0-7
                 'in' is parenthesized before it is cast to v8i16, so an
                 expression argument is converted as a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)    \
{                                                       \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1657cabdff1aSopenharmony_ci
/* Description : Three indexed halfword element values are each replicated
                 to all elements of an output vector
   Arguments   : Inputs  - in, idx0, idx1, idx2
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0' and 'out1' are produced by SPLATI_H2 from 'idx0' and
                 'idx1'; 'out2' is produced here with __msa_splati_h from
                 'idx2'.
                 'in' is parenthesized before the v8i16 cast so an
                 expression argument is converted as a whole.
*/
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,          \
                  out0, out1, out2)                     \
{                                                       \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
    out2 = (RTYPE) __msa_splati_h((v8i16) (in), idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1666cabdff1aSopenharmony_ci
/* Description : Four indexed halfword element values are each replicated
                 to all elements of an output vector
   Arguments   : Inputs  - in, idx0, idx1, idx2, idx3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to SPLATI_H2 with (idx0, idx1) -> (out0, out1)
                 followed by SPLATI_H2 with (idx2, idx3) -> (out2, out3),
                 both reading the same 'in' vector.
*/
#define SPLATI_H4(RTYPE, in,                       \
                  idx0, idx1, idx2, idx3,          \
                  out0, out1, out2, out3)          \
{                                                  \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);  \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);  \
}
#define SPLATI_H4_SB(...)  SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...)  SPLATI_H4(v8i16, __VA_ARGS__)
1675cabdff1aSopenharmony_ci
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector
                 Valid index range for word operation is 0-3; this applies
                 to 'stidx + 1' as well as to 'stidx'.
                 'in' and 'stidx' are parenthesized inside the expansion, so
                 an expression argument is cast (or incremented) as a whole.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                \
{                                                              \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), stidx);        \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx) + 1);  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1694cabdff1aSopenharmony_ci
/* Description : Each of the four word elements of a vector is replicated
                 to all elements of its own output vector
   Arguments   : Inputs  - in
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to SPLATI_W2 with start index 0 -> (out0, out1)
                 followed by SPLATI_W2 with start index 2 -> (out2, out3).
*/
#define SPLATI_W4(RTYPE, in,               \
                  out0, out1, out2, out3)  \
{                                          \
    SPLATI_W2(RTYPE, in, 0, out0, out1);   \
    SPLATI_W2(RTYPE, in, 2, out2, out3);   \
}
#define SPLATI_W4_SH(...)  SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...)  SPLATI_W4(v4i32, __VA_ARGS__)
1702cabdff1aSopenharmony_ci
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the
                 right half of 'out0'.
                 Even byte elements of 'in2' are copied to the left half of
                 'out1' & even byte elements of 'in3' are copied to the
                 right half of 'out1'.
                 Each input is parenthesized before it is cast to v16i8, so
                 an expression argument is converted as a whole.
                 'out0' is assigned before 'in2' and 'in3' are read.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1723cabdff1aSopenharmony_ci
1724cabdff1aSopenharmony_ci#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
1725cabdff1aSopenharmony_ci{                                                                        \
1726cabdff1aSopenharmony_ci    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
1727cabdff1aSopenharmony_ci    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
1728cabdff1aSopenharmony_ci}
1729cabdff1aSopenharmony_ci#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1730cabdff1aSopenharmony_ci#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1731cabdff1aSopenharmony_ci
1732cabdff1aSopenharmony_ci#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1733cabdff1aSopenharmony_ci                 out0, out1, out2, out3)                         \
1734cabdff1aSopenharmony_ci{                                                                \
1735cabdff1aSopenharmony_ci    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
1736cabdff1aSopenharmony_ci    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
1737cabdff1aSopenharmony_ci}
1738cabdff1aSopenharmony_ci#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1739cabdff1aSopenharmony_ci#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1740cabdff1aSopenharmony_ci#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1741cabdff1aSopenharmony_ci#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1742cabdff1aSopenharmony_ci
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 PCKEV_H4 does the same for four input pairs.
                 Every input is parenthesised before the v8i16 cast, so an
                 expression argument is cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1770cabdff1aSopenharmony_ci
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of in0 are copied to the left half of
                 out0 & even double elements of in1 are copied to the right
                 half of out0.
                 Even double elements of in2 are copied to the left half of
                 out1 & even double elements of in3 are copied to the right
                 half of out1.
                 PCKEV_D4 does the same for four input pairs.
                 Every input is parenthesised before the v2i64 cast, so an
                 expression argument is cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1798cabdff1aSopenharmony_ci
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The odd (index 1) double word elements of 'in0' and 'in1'
                 are packed into 'out0'; those of 'in2' and 'in3' are packed
                 into 'out1'.
                 When the same vector is passed as both inputs of a pair, its
                 index 1 double word element is overwritten to index 0 of the
                 result.
                 Every input is parenthesised before the v2i64 cast, so an
                 expression argument is cast as a whole.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1816cabdff1aSopenharmony_ci
/* Description : XOR every byte element with the immediate 128
   Arguments   : Inputs  - in0, in1, ... (2 to 8 vectors, by macro name)
                 Outputs - the same vectors (updated in place)
                 Return Type - as per RTYPE
   Details     : Each vector argument is viewed as v16u8, every byte is
                 xor'ed with 128 and the result, cast to RTYPE, is stored
                 back into the same argument.
                 Vectors are processed in argument order.
*/
#define XORI_B2_128(RTYPE, vec0, vec1)                 \
{                                                      \
    vec0 = (RTYPE) __msa_xori_b((v16u8) (vec0), 128);  \
    vec1 = (RTYPE) __msa_xori_b((v16u8) (vec1), 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, vec0, vec1, vec2)           \
{                                                      \
    vec0 = (RTYPE) __msa_xori_b((v16u8) (vec0), 128);  \
    vec1 = (RTYPE) __msa_xori_b((v16u8) (vec1), 128);  \
    vec2 = (RTYPE) __msa_xori_b((v16u8) (vec2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, vec0, vec1, vec2, vec3)     \
{                                                      \
    XORI_B3_128(RTYPE, vec0, vec1, vec2);              \
    vec3 = (RTYPE) __msa_xori_b((v16u8) (vec3), 128);  \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, vec0, vec1, vec2, vec3, vec4)  \
{                                                         \
    XORI_B2_128(RTYPE, vec0, vec1);                       \
    XORI_B3_128(RTYPE, vec2, vec3, vec4);                 \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, vec0, vec1, vec2, vec3, vec4, vec5)  \
{                                                               \
    XORI_B3_128(RTYPE, vec0, vec1, vec2);                       \
    XORI_B3_128(RTYPE, vec3, vec4, vec5);                       \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, vec0, vec1, vec2, vec3, vec4, vec5, vec6)  \
{                                                                     \
    XORI_B3_128(RTYPE, vec0, vec1, vec2);                             \
    XORI_B4_128(RTYPE, vec3, vec4, vec5, vec6);                       \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7)  \
{                                                                           \
    XORI_B6_128(RTYPE, vec0, vec1, vec2, vec3, vec4, vec5);                 \
    XORI_B2_128(RTYPE, vec6, vec7);                                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1882cabdff1aSopenharmony_ci
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
                 ADDS_SH4 does the same for four input pairs.
                 Every input is parenthesised before the v8i16 cast, so an
                 expression argument is cast as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1907cabdff1aSopenharmony_ci
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
                 SLLI_2V handles two vectors, SLLI_4V four.
                 Operands are parenthesised so that a 'shift' expression such
                 as 'a | b' is applied as a whole.
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 = (in0) << (shift);       \
    in1 = (in1) << (shift);       \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = (in0) << (shift);                 \
    in1 = (in1) << (shift);                 \
    in2 = (in2) << (shift);                 \
    in3 = (in3) << (shift);                 \
}
1928cabdff1aSopenharmony_ci
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
                 The shift is done with the C '>>' operator.
                 NOTE(review): '>>' is only arithmetic for signed element
                 types - confirm no caller passes unsigned vectors.
                 Operands are parenthesised so that a 'shift' expression such
                 as 'a | b' is applied as a whole.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
1946cabdff1aSopenharmony_ci
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 Both the data vectors and 'shift' are parenthesised before
                 the v8i16 cast, so an expression passed as 'shift' is cast
                 as a whole.
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* SRLR_H4 / SRLR_H8 : same in-place pattern as SRL_H4 for four / eight
                       vectors, but using the __msa_srlr_h intrinsic (the
                       rounding form of the logical right shift).
*/
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                                \
    in0 = (RTYPE) __msa_srlr_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1983cabdff1aSopenharmony_ci
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_H3 / SRAR_H4 handle three / four vectors.
                 Both the data vectors and 'shift' are parenthesised before
                 the v8i16 cast, so an expression passed as 'shift' is cast
                 as a whole.
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, shift);                             \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2017cabdff1aSopenharmony_ci
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_W4 handles four vectors.
                 Both the data vectors and 'shift' are parenthesised before
                 the v4i32 cast, so an expression passed as 'shift' is cast
                 as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift));  \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift));  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2042cabdff1aSopenharmony_ci
/* Description : Rounded arithmetic right shift of halfword elements by an
                 immediate
   Arguments   : Inputs  - vectors to shift (2 for SRARI_H2, 4 for SRARI_H4),
                           followed by the shift amount
                 Outputs - the same vectors (updated in place)
                 Return Type - as per RTYPE
   Details     : Each vector is viewed as v8i16 and passed with the shift
                 amount to __msa_srari_h; the result, cast to RTYPE, is
                 written back to the same argument.
                 The last discarded bit is added to the shifted value for
                 rounding.
                 Vectors are processed in argument order.
*/
#define SRARI_H2(RTYPE, vec0, vec1, imm)                  \
{                                                         \
    vec0 = (RTYPE) __msa_srari_h((v8i16) (vec0), (imm));  \
    vec1 = (RTYPE) __msa_srari_h((v8i16) (vec1), (imm));  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, vec0, vec1, vec2, vec3, imm)      \
{                                                         \
    vec0 = (RTYPE) __msa_srari_h((v8i16) (vec0), (imm));  \
    vec1 = (RTYPE) __msa_srari_h((v8i16) (vec1), (imm));  \
    vec2 = (RTYPE) __msa_srari_h((v8i16) (vec2), (imm));  \
    vec3 = (RTYPE) __msa_srari_h((v8i16) (vec3), (imm));  \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2068cabdff1aSopenharmony_ci
/* Description : Rounded arithmetic right shift of word elements by an
                 immediate
   Arguments   : Inputs  - vectors to shift (2 for SRARI_W2, 4 for SRARI_W4),
                           followed by the shift amount
                 Outputs - the same vectors (updated in place)
                 Return Type - as per RTYPE
   Details     : Each vector is viewed as v4i32 and passed with the shift
                 amount to __msa_srari_w; the result, cast to RTYPE, is
                 written back to the same argument.
                 The last discarded bit is added to the shifted value for
                 rounding.
                 Vectors are processed in argument order.
*/
#define SRARI_W2(RTYPE, vec0, vec1, imm)                  \
{                                                         \
    vec0 = (RTYPE) __msa_srari_w((v4i32) (vec0), (imm));  \
    vec1 = (RTYPE) __msa_srari_w((v4i32) (vec1), (imm));  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, vec0, vec1, vec2, vec3, imm)      \
{                                                         \
    vec0 = (RTYPE) __msa_srari_w((v4i32) (vec0), (imm));  \
    vec1 = (RTYPE) __msa_srari_w((v4i32) (vec1), (imm));  \
    vec2 = (RTYPE) __msa_srari_w((v4i32) (vec2), (imm));  \
    vec3 = (RTYPE) __msa_srari_w((v4i32) (vec3), (imm));  \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2093cabdff1aSopenharmony_ci
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 MUL4 does the same for four input pairs.
                 Operands are parenthesised so that expression arguments
                 (e.g. 'a + b') are multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
2111cabdff1aSopenharmony_ci
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
                 ADD4 does the same for four input pairs.
                 Operands are parenthesised so that expression arguments with
                 lower precedence than '+' (e.g. 'a << b') are added as a
                 whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
2128cabdff1aSopenharmony_ci
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
                 SUB4 does the same for four input pairs.
                 Operands are parenthesised so that expression arguments
                 (e.g. a subtrahend 'a + b') are subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) - (in1);                     \
    out1 = (in2) - (in3);                     \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
2147cabdff1aSopenharmony_ci
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Each byte of 'in' is compared against 0 to build a sign
                 vector, which is then right-interleaved with 'in' itself
                 to generate 8 halfword elements keeping sign intact
                 'in' is expanded twice, so it must not have side effects
*/
#define UNPCK_R_SB_SH(in, out)                         \
{                                                      \
    v16i8 sign_m;                                      \
                                                       \
    sign_m = __msa_clti_s_b((v16i8) (in), 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) (in));  \
}
2163cabdff1aSopenharmony_ci
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (input halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Each halfword of 'in' is compared against 0 to build a
                 sign vector, which is then right-interleaved with 'in'
                 itself to generate 4 word elements keeping sign intact
                 'in' is expanded twice, so it must not have side effects
*/
#define UNPCK_R_SH_SW(in, out)                         \
{                                                      \
    v8i16 sign_m;                                      \
                                                       \
    sign_m = __msa_clti_s_h((v8i16) (in), 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) (in));  \
}
2179cabdff1aSopenharmony_ci
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Input   - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Each byte of 'in' is compared against 0 to build a sign
                 vector. ILVRL_B2_SH then interleaves that sign vector
                 with 'in': the right interleave gives the 8 signed
                 halfword elements in 'out0', the left interleave gives
                 the 8 signed halfword elements in 'out1'
                 'in' is expanded more than once, so it must not have
                 side effects
*/
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0);         \
    ILVRL_B2_SH(tmp_m, (in), out0, out1);            \
}
2198cabdff1aSopenharmony_ci
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero vector through
                 ILVRL_B2_SH
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, (in), out0, out1);            \
}
2212cabdff1aSopenharmony_ci
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Input   - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Each halfword of 'in' is compared against 0 to build a
                 sign vector. ILVRL_H2_SW then interleaves that sign vector
                 with 'in': the right interleave gives the 4 signed word
                 elements in 'out0', the left interleave gives the
                 4 signed word elements in 'out1'
                 'in' is expanded more than once, so it must not have
                 side effects
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0);         \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);            \
}
2231cabdff1aSopenharmony_ci
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : The values are exchanged through a temporary that has the
                 type of 'in0'. Unlike an xor based swap, swapping a
                 variable with itself leaves it unchanged and element
                 types without a '^' operator are accepted
                 Both arguments must be modifiable lvalues
*/
#define SWAP(in0, in1)                   \
{                                        \
    __typeof__(in0) swap_tmp_m = (in0);  \
                                         \
    (in0) = (in1);                       \
    (in1) = swap_tmp_m;                  \
}
2243cabdff1aSopenharmony_ci
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 Outputs are assigned one after another while the inputs
                 are read again for the differences, so an output must not
                 name an input that a later line still reads
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = (in0) + (in3);                                        \
    out1 = (in1) + (in2);                                        \
                                                                 \
    out2 = (in1) - (in2);                                        \
    out3 = (in0) - (in3);                                        \
}
2257cabdff1aSopenharmony_ci
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out0..out3 are the sums      in0 + in7 .. in3 + in4
                 out4..out7 are the differences in3 - in4 .. in0 - in7
                 Outputs are assigned one after another while the inputs
                 are read again for the differences, so an output must not
                 name an input that a later line still reads
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = (in0) + (in7);                                            \
    out1 = (in1) + (in6);                                            \
    out2 = (in2) + (in5);                                            \
    out3 = (in3) + (in4);                                            \
                                                                     \
    out4 = (in3) - (in4);                                            \
    out5 = (in2) - (in5);                                            \
    out6 = (in1) - (in6);                                            \
    out7 = (in0) - (in7);                                            \
}
2276cabdff1aSopenharmony_ci
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
                 out0..out7  are the sums        in0 + in15 .. in7 + in8
                 out8..out15 are the differences in7 - in8  .. in0 - in15
                 Outputs are assigned one after another while the inputs
                 are read again for the differences, so an output must not
                 name an input that a later line still reads
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = (in0) + (in15);                                                  \
    out1 = (in1) + (in14);                                                  \
    out2 = (in2) + (in13);                                                  \
    out3 = (in3) + (in12);                                                  \
    out4 = (in4) + (in11);                                                  \
    out5 = (in5) + (in10);                                                  \
    out6 = (in6) + (in9);                                                   \
    out7 = (in7) + (in8);                                                   \
                                                                            \
    out8 = (in7) - (in8);                                                   \
    out9 = (in6) - (in9);                                                   \
    out10 = (in5) - (in10);                                                 \
    out11 = (in4) - (in11);                                                 \
    out12 = (in3) - (in12);                                                 \
    out13 = (in2) - (in13);                                                 \
    out14 = (in1) - (in14);                                                 \
    out15 = (in0) - (in15);                                                 \
}
2305cabdff1aSopenharmony_ci
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The four inputs are combined with doubleword and byte
                 interleaves into 'out0'. 'out1', 'out2' and 'out3' are
                 each derived from the previous output by a 4 byte
                 __msa_sldi_b slide against a zero vector
                 Inputs are forwarded parenthesized so expression
                 arguments stay grouped inside the helper macros
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB((in1), (in0), (in3), (in2), s0_m, s1_m);                 \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
2325cabdff1aSopenharmony_ci
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block, one row per vector)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : The inputs are paired as (in0, in4), (in1, in5),
                 (in2, in6), (in3, in7) with even-word interleaves, then
                 merged with byte, halfword and word interleaves into
                 'out0' and 'out2'. 'out1' and 'out3' are built from the
                 left doublewords of 'out0' and 'out2'
                 Inputs are forwarded parenthesized so expression
                 arguments stay grouped inside the helper macros
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB((in0), (in4), (in1), (in5), tmp0_m, tmp1_m);            \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB((in2), (in6), (in3), (in7), tmp0_m, tmp1_m);            \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2350cabdff1aSopenharmony_ci
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Right byte interleaves of the input pairs (in0, in2),
                 (in1, in3), (in4, in6), (in5, in7) are merged with byte
                 and word interleaves into the even outputs. Each odd
                 output is then derived from the preceding even output by
                 SLDI_B4 with a slide of 8 against a zero vector
                 The zero vector carries the '_m' suffix like every other
                 macro-local so that it cannot capture a caller variable
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
    v16i8 zero_m = { 0 };                                                \
                                                                         \
    ILVR_B4_SB((in2), (in0), (in3), (in1), (in6), (in4), (in7), (in5),   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B4(RTYPE, zero_m, out0, zero_m, out2, zero_m, out4,             \
            zero_m, out6, 8, out1, out3, out5, out7);                    \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2377cabdff1aSopenharmony_ci
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Inputs are gathered four at a time (in0/in4/in8/in12,
                 in1/in5/in9/in13, ...) with even-word and even-doubleword
                 interleaves, then the four gathered vectors are split
                 into the outputs with even/odd byte and halfword
                 interleaves
                 'out1' and 'out3' are used as scratch before in1..in15
                 have all been read, so the outputs must not name any of
                 the inputs
                 Inputs are forwarded parenthesized so expression
                 arguments stay grouped inside the helper macros
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD((in0), (in4), (in8), (in12), tmp0_m, tmp1_m);              \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD((in1), (in5), (in9), (in13), tmp0_m, tmp1_m);              \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD((in2), (in6), (in10), (in14), tmp0_m, tmp1_m);             \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD((in3), (in7), (in11), (in15), tmp0_m, tmp1_m);             \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
2412cabdff1aSopenharmony_ci
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Inputs are paired as (in0, in8), (in1, in9), ...,
                 (in7, in15) with even-doubleword interleaves, then
                 split with even/odd byte, halfword and word interleaves
                 into the eight outputs
                 out0..out7 are handed to ILVEV_D2_UB as destinations and
                 reused as scratch before all inputs have been read, so
                 the outputs must not name any of the inputs
                 Inputs are forwarded parenthesized so expression
                 arguments stay grouped inside the helper macros
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB((in0), (in8), (in1), (in9), out7, out6);                     \
    ILVEV_D2_UB((in2), (in10), (in3), (in11), out5, out4);                   \
    ILVEV_D2_UB((in4), (in12), (in5), (in13), out3, out2);                   \
    ILVEV_D2_UB((in6), (in14), (in7), (in15), out1, out0);                   \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2459cabdff1aSopenharmony_ci
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Right halfword interleaves of (in0, in1) and (in2, in3)
                 are merged with word interleaves into 'out0' and 'out2'.
                 'out1' and 'out3' are then built from the left
                 doublewords of 'out0' and 'out2'
                 Inputs are forwarded parenthesized so expression
                 arguments stay grouped inside the helper macros
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH((in1), (in0), (in3), (in2), s0_m, s1_m);                 \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
2475cabdff1aSopenharmony_ci
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : The right and left halfword interleaves of in4..in7 and
                 of in0..in3 are each merged with a further halfword
                 interleave into tmp0_m..tmp7_m. Even outputs pack the
                 even doublewords of the temporaries (PCKEV_D4), odd
                 outputs pack the odd doublewords (__msa_pckod_d)
                 Every input is expanded twice, so the arguments must not
                 have side effects. They are forwarded parenthesized so
                 expression arguments stay grouped inside the helper macros
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH((in6), (in4), (in7), (in5), s0_m, s1_m);                 \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH((in6), (in4), (in7), (in5), s0_m, s1_m);                 \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH((in2), (in0), (in3), (in1), s0_m, s1_m);                 \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH((in2), (in0), (in3), (in1), s0_m, s1_m);                 \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2506cabdff1aSopenharmony_ci
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Word interleaves of (in0, in1) and (in2, in3) produce
                 four temporaries. 'out0'/'out1' are the right/left
                 doubleword interleaves of the first pair of temporaries,
                 'out2'/'out3' those of the second pair
                 All inputs are read before any output is written
                 Inputs are forwarded parenthesized so expression
                 arguments stay grouped inside the helper macros
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW((in1), (in0), s0_m, s1_m);                              \
    ILVRL_W2_SW((in3), (in2), s2_m, s3_m);                              \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
2525cabdff1aSopenharmony_ci
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The unsigned byte elements of each input pair
                 ('in0','in1'), ('in2','in3'), ('in4','in5'), ('in6','in7')
                 are averaged (a + b)/2 into 'tmp0_m' .. 'tmp3_m'.
                 Doubleword 0 (the low 8 bytes) of each result is then
                 written by SD4 to four rows starting at 'pdst', 'stride'
                 apart.
                 Every vector argument is parenthesized before it is cast,
                 so compound expressions may be passed safely.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1));                   \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3));                   \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5));                   \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7));                   \
                                                                            \
    /* only the low doubleword of each averaged row is stored */            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, (pdst), (stride));                  \
}
2556cabdff1aSopenharmony_ci
/* Description : Average byte elements from pair of vectors and store 16x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The unsigned byte elements of each input pair
                 ('in0','in1'), ('in2','in3'), ('in4','in5'), ('in6','in7')
                 are averaged (a + b)/2 into 'tmp0_m' .. 'tmp3_m'.
                 The four full result vectors are then written by ST_UB4 to
                 four rows starting at 'pdst', 'stride' apart.
                 Every vector argument is parenthesized before it is cast,
                 so compound expressions may be passed safely.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1));                    \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3));                    \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5));                    \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7));                    \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, (pdst), (stride));                \
}
2582cabdff1aSopenharmony_ci
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : AVER_UB4_UB average-rounds, (a + b + 1)/2, the byte elements
                 of the pairs ('in0','in1'), ('in2','in3'), ('in4','in5') and
                 ('in6','in7') into 'tp0_m' .. 'tp3_m'.
                 Doubleword 0 (the low 8 bytes) of each result is then
                 written by SD4 to four rows starting at 'pdst', 'stride'
                 apart.
                 The temporaries are named 'tpN_m' because AVER_DST_ST8x4_UB
                 invokes this macro with its own 'tmpN_m' / 'dstN_m'
                 variables as arguments; the names must not collide.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
                                                                             \
    /* rounded average of the four input pairs */                            \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    /* keep the low doubleword of each row and store the 8x4 block */        \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
2611cabdff1aSopenharmony_ci
/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : AVER_UB4_UB average-rounds, (a + b + 1)/2, the byte elements
                 of the pairs ('in0','in1'), ('in2','in3'), ('in4','in5') and
                 ('in6','in7') into 't0_m' .. 't3_m'.
                 The four full result vectors are then written by ST_UB4 to
                 four rows starting at 'pdst', 'stride' apart.
                 The temporaries are named 'tN_m' because AVER_DST_ST16x4_UB
                 invokes this macro with its own 'tmpN_m' / 'dstN_m'
                 variables as arguments; the names must not collide.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m;                                                         \
    v16u8 t2_m, t3_m;                                                         \
                                                                              \
    /* rounded average of the four input pairs */                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
                                                                              \
    /* store the 16x4 block */                                                \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
2634cabdff1aSopenharmony_ci
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Step 1 - AVER_UB4_UB average-rounds, (a + b + 1)/2, the
                 pairs ('in0','in1'), ('in2','in3'), ('in4','in5') and
                 ('in6','in7') into 'tmp0_m' .. 'tmp3_m'.
                 Step 2 - LD_UB4 loads four rows from 'pdst' ('stride' apart)
                 into 'dst0_m' .. 'dst3_m'.
                 Step 3 - AVER_ST8x4_UB average-rounds each 'dstN_m' with the
                 matching 'tmpN_m' and stores the low 8 bytes of every row
                 back to 'pdst'.
                 Note that 'pdst' is expanded both in the load and in the
                 store, so it should be free of side effects.
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
                                                                   \
    /* source pairs -> rounded averages */                         \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    /* current destination rows */                                 \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    /* blend with destination and write the 8x4 block */           \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2662cabdff1aSopenharmony_ci
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Step 1 - AVER_UB4_UB average-rounds, (a + b + 1)/2, the
                 pairs ('in0','in1'), ('in2','in3'), ('in4','in5') and
                 ('in6','in7') into 'tmp0_m' .. 'tmp3_m'.
                 Step 2 - LD_UB4 loads four rows from 'pdst' ('stride' apart)
                 into 'dst0_m' .. 'dst3_m'.
                 Step 3 - AVER_ST16x4_UB average-rounds each 'dstN_m' with
                 the matching 'tmpN_m' and stores the full 16-byte rows back
                 to 'pdst'.
                 Note that 'pdst' is expanded both in the load and in the
                 store, so it should be free of side effects.
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
                                                                    \
    /* source pairs -> rounded averages */                          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    /* current destination rows */                                  \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    /* blend with destination and write the 16x4 block */           \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2690cabdff1aSopenharmony_ci
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : 'in0' .. 'in3' are treated as halfword vectors; ILVR_D2_SH
                 combines them pairwise ('in0' with 'in1', 'in2' with 'in3')
                 into 'inp0_m' / 'inp1_m'.
                 Four 32-bit words (one per row, 'stride' apart) are loaded
                 from 'pdst', inserted into byte vectors and widened to
                 halfwords by interleaving with zero.
                 The widened destination values and the inputs are added,
                 clipped between 0-255, packed back to bytes and the four
                 resulting words are stored to the same four rows.
                 'pdst' is expanded both in the load and in the store, so it
                 should be free of side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    /* terminated with ';' like every other statement here */     \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
    LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    /* zero-extend destination bytes to halfwords */              \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    /* words 0 and 1 of each packed vector hold one row each */   \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
2720cabdff1aSopenharmony_ci
/* Description : Dot product and accumulation of 3 signed byte input vectors,
                 producing a signed halfword vector
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - out0_m (value of the macro expression)
                 Return Type - signed halfword
   Details     : All six arguments are cast to signed byte vectors (v16i8).
                 Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1', accumulated
                 Dot product of 'in2' with 'coeff2', accumulated

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)

                 Every argument is parenthesized before it is cast, so
                 compound expressions may be passed safely.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)             \
( {                                                                     \
    v8i16 out0_m;                                                       \
                                                                        \
    out0_m = __msa_dotp_s_h((v16i8) (in0), (v16i8) (coeff0));           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) (in1), (v16i8) (coeff1));  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) (in2), (v16i8) (coeff2));  \
                                                                        \
    out0_m;                                                             \
} )
2742cabdff1aSopenharmony_ci
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - out_m (value of the macro expression)
                 Return Type - unsigned byte
   Details     : Even byte elements from 'in0' and 'in1' are packed
                 together in one vector and every byte of the result is
                 xor'ed with 128 to shift the range from signed to unsigned
                 byte.
                 Both arguments are parenthesized before they are cast, so
                 compound expressions may be passed safely.
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 out_m;                                                  \
    out_m = (v16u8) __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);             \
    out_m;                                                        \
} )
2758cabdff1aSopenharmony_ci
/* Description : Converts inputs to unsigned bytes, average-rounds them with
                 the given destination vectors & stores the result as 8x4
                 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : PCKEV_XORI128_UB packs ('in0','in1') into 'tmp0_m' and
                 ('in2','in3') into 'tmp1_m', xor-ing with 128.
                 AVER_UB2_UB then averages 'tmp0_m' with 'dst0' and 'tmp1_m'
                 with 'dst1', in place.
                 ST_D4 stores doublewords 0 and 1 of 'tmp0_m' followed by
                 doublewords 0 and 1 of 'tmp1_m' (the index arguments
                 0, 1, 0, 1), 'stride' apart, starting at 'pdst'.
                 'pdst' is captured once in the local 'pdst_m'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
    v16u8 tmp0_m;                                             \
    v16u8 tmp1_m;                                             \
                                                              \
    /* pack to bytes and move to unsigned range */            \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                      \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                      \
    /* rounded average with the destination vectors */        \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    /* four 8-byte rows */                                    \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}
2774cabdff1aSopenharmony_ci
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB packs ('in0','in1') into 'tmp0_m' and
                 ('in2','in3') into 'tmp1_m'.
                 Words 0 and 2 of 'tmp0_m' become rows 0 and 1, words 0 and 2
                 of 'tmp1_m' become rows 2 and 3; SW4 writes them 'stride'
                 apart starting at 'pdst'.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    v16i8 tmp0_m, tmp1_m;                                 \
    uint32_t out0_m, out1_m;                              \
    uint32_t out2_m, out3_m;                              \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    /* rows 0 and 1 */                                    \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    /* rows 2 and 3 */                                    \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
2794cabdff1aSopenharmony_ci
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs  - in0, in1, pdst
   Details     : Even byte elements of 'in0' and 'in1' are packed into one
                 signed byte vector, which ST_SB writes to 'pdst'.
                 All arguments are parenthesized before use, so compound
                 expressions may be passed safely.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                       \
{                                                         \
    v16i8 tmp_m;                                          \
    tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    ST_SB(tmp_m, (pdst));                                 \
}
2805cabdff1aSopenharmony_ci
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
                 Outputs - tmp1_m (value of the macro expression)
                 Return Type - unsigned halfword
   Details     : Bytes of 'in0' / 'in1' are shuffled according to 'mask',
                 the shuffled bytes are multiplied with 'coeff' as an
                 unsigned byte dot product into halfwords, the halfwords are
                 shifted right by 'shift' with rounding and finally
                 saturated.
                 'shift' is passed to the intrinsics as an immediate operand.
                 The vector arguments are parenthesized before they are cast,
                 so compound expressions may be passed safely.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)                  \
( {                                                                       \
    v16i8 tmp0_m;                                                         \
    v8u16 tmp1_m;                                                         \
                                                                          \
    tmp0_m = __msa_vshf_b((v16i8) (mask), (v16i8) (in1), (v16i8) (in0));  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) (coeff));             \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);                \
    /* NOTE(review): the saturation immediate reuses 'shift' - */         \
    /* confirm this coupling is intended for all callers */               \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                                \
                                                                          \
    tmp1_m;                                                               \
} )
2821cabdff1aSopenharmony_ci#endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */
2822