1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * All rights reserved.
4  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
5  *                Xiwei Gu   <guxiwei-hf@loongson.cn>
6  *                Lu Wang    <wanglu@loongson.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  *
24  */
25 
26 #ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
27 #define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
28 
29 /*
30  * Copyright (c) 2021 Loongson Technology Corporation Limited
31  * All rights reserved.
32  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
33  *                Xiwei Gu   <guxiwei-hf@loongson.cn>
34  *                Lu Wang    <wanglu@loongson.cn>
35  *
36  * This file is a header file for loongarch builtin extension.
37  *
38  */
39 
40 #ifndef LOONGSON_INTRINSICS_H
41 #define LOONGSON_INTRINSICS_H
42 
43 /**
44  * MAJOR version: Macro usage changes.
45  * MINOR version: Add new functions, or bug fixes.
46  * MICRO version: Comment changes or implementation changes.
47  */
48 #define LSOM_VERSION_MAJOR 1
49 #define LSOM_VERSION_MINOR 1
50 #define LSOM_VERSION_MICRO 0
51 
/*
 * Apply the one-operand operation _op twice, in this order:
 *   _dst0 = _op(_src0);
 *   _dst1 = _op(_src1);
 * The expansion is a brace-enclosed block, so it should not be used as the
 * unbraced body of an `if` that is followed by an `else`.
 */
#define DUP2_ARG1(_op, _src0, _src1, _dst0, _dst1) \
  {                                                \
    _dst0 = _op(_src0);                            \
    _dst1 = _op(_src1);                            \
  }
57 
/*
 * Apply the two-operand operation _op twice, in this order:
 *   _dst0 = _op(_a0, _b0);
 *   _dst1 = _op(_a1, _b1);
 * The expansion is a brace-enclosed block of two statements.
 */
#define DUP2_ARG2(_op, _a0, _b0, _a1, _b1, _dst0, _dst1) \
  {                                                      \
    _dst0 = _op(_a0, _b0);                               \
    _dst1 = _op(_a1, _b1);                               \
  }
63 
/*
 * Apply the three-operand operation _op twice, in this order:
 *   _dst0 = _op(_a0, _b0, _c0);
 *   _dst1 = _op(_a1, _b1, _c1);
 * The expansion is a brace-enclosed block of two statements.
 */
#define DUP2_ARG3(_op, _a0, _b0, _c0, _a1, _b1, _c1, _dst0, _dst1) \
  {                                                                \
    _dst0 = _op(_a0, _b0, _c0);                                    \
    _dst1 = _op(_a1, _b1, _c1);                                    \
  }
69 
/*
 * Apply the one-operand operation _op to four sources, in this order:
 *   _dst0 = _op(_src0); _dst1 = _op(_src1);
 *   _dst2 = _op(_src2); _dst3 = _op(_src3);
 * Written out directly; the statements match two consecutive DUP2_ARG1
 * expansions.
 */
#define DUP4_ARG1(_op, _src0, _src1, _src2, _src3, _dst0, _dst1, _dst2, _dst3) \
  {                                                                            \
    _dst0 = _op(_src0);                                                        \
    _dst1 = _op(_src1);                                                        \
    _dst2 = _op(_src2);                                                        \
    _dst3 = _op(_src3);                                                        \
  }
75 
/*
 * Apply the two-operand operation _op to four operand pairs, in this order:
 *   _dst0 = _op(_a0, _b0); _dst1 = _op(_a1, _b1);
 *   _dst2 = _op(_a2, _b2); _dst3 = _op(_a3, _b3);
 * Written out directly; the statements match two consecutive DUP2_ARG2
 * expansions.
 */
#define DUP4_ARG2(_op, _a0, _b0, _a1, _b1, _a2, _b2, _a3, _b3, _dst0, _dst1, \
                  _dst2, _dst3)                                              \
  {                                                                          \
    _dst0 = _op(_a0, _b0);                                                   \
    _dst1 = _op(_a1, _b1);                                                   \
    _dst2 = _op(_a2, _b2);                                                   \
    _dst3 = _op(_a3, _b3);                                                   \
  }
82 
/*
 * Apply the three-operand operation _op to four operand triples, in this
 * order:
 *   _dst0 = _op(_a0, _b0, _c0); _dst1 = _op(_a1, _b1, _c1);
 *   _dst2 = _op(_a2, _b2, _c2); _dst3 = _op(_a3, _b3, _c3);
 * Written out directly; the statements match two consecutive DUP2_ARG3
 * expansions.
 */
#define DUP4_ARG3(_op, _a0, _b0, _c0, _a1, _b1, _c1, _a2, _b2, _c2, _a3, _b3, \
                  _c3, _dst0, _dst1, _dst2, _dst3)                            \
  {                                                                           \
    _dst0 = _op(_a0, _b0, _c0);                                               \
    _dst1 = _op(_a1, _b1, _c1);                                               \
    _dst2 = _op(_a2, _b2, _c2);                                               \
    _dst3 = _op(_a3, _b3, _c3);                                               \
  }
89 
90 #ifdef __loongarch_sx
91 #include <lsxintrin.h>
92 /*
93  * =============================================================================
94  * Description : Dot product & addition of byte vector elements
95  * Arguments   : Inputs  - in_c, in_h, in_l
96  *               Outputs - out
97  *               Return Type - halfword
98  * Details     : Signed byte elements from in_h are multiplied by
99  *               signed byte elements from in_l, and then added adjacent to
100  *               each other to get results with the twice size of input.
101  *               Then the results plus to signed half-word elements from in_c.
102  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
103  *        in_c : 1,2,3,4, 1,2,3,4
104  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
105  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
106  *         out : 23,40,41,26, 23,40,41,26
107  * =============================================================================
108  */
static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  /* Accumulate the widened even-lane byte products onto in_c ... */
  const __m128i even_acc = __lsx_vmaddwev_h_b(in_c, in_h, in_l);

  /* ... then add the widened odd-lane products to complete each pair sum. */
  return __lsx_vmaddwod_h_b(even_acc, in_h, in_l);
}
117 
118 /*
119  * =============================================================================
120  * Description : Dot product & addition of byte vector elements
121  * Arguments   : Inputs  - in_c, in_h, in_l
122  *               Outputs - out
123  *               Return Type - halfword
124  * Details     : Unsigned byte elements from in_h are multiplied by
125  *               unsigned byte elements from in_l, and then added adjacent to
126  *               each other to get results with the twice size of input.
127  *               The results plus to signed half-word elements from in_c.
128  * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
129  *        in_c : 1,2,3,4, 1,2,3,4
130  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
131  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
132  *         out : 23,40,41,26, 23,40,41,26
133  * =============================================================================
134  */
static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
                                         __m128i in_l) {
  /* Unsigned variant: even-lane products are accumulated onto in_c first. */
  const __m128i even_acc = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);

  /* The odd-lane products are then added to the same half-word lanes. */
  return __lsx_vmaddwod_h_bu(even_acc, in_h, in_l);
}
143 
144 /*
145  * =============================================================================
146  * Description : Dot product & addition of byte vector elements
147  * Arguments   : Inputs  - in_c, in_h, in_l
148  *               Outputs - out
149  *               Return Type - halfword
150  * Details     : Unsigned byte elements from in_h are multiplied by
151  *               signed byte elements from in_l, and then added adjacent to
152  *               each other to get results with the twice size of input.
153  *               The results plus to signed half-word elements from in_c.
154  * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
155  *        in_c : 1,1,1,1, 1,1,1,1
156  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
157  *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
158  *         out : -4,-24,-60,-112, 6,26,62,114
159  * =============================================================================
160  */
static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
                                           __m128i in_l) {
  /* Mixed-sign variant (unsigned in_h, signed in_l): even lanes first. */
  const __m128i even_acc = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);

  /* Odd-lane products are added on top of the even-lane accumulation. */
  return __lsx_vmaddwod_h_bu_b(even_acc, in_h, in_l);
}
169 
170 /*
171  * =============================================================================
172  * Description : Dot product & addition of half-word vector elements
173  * Arguments   : Inputs  - in_c, in_h, in_l
174  *               Outputs - out
 *               Return Type - word
 * Details     : Signed half-word elements from in_h are multiplied by
 *               signed half-word elements from in_l, and adjacent products are
 *               added together to give results twice the width of the inputs.
 *               The results are then added to the signed word elements of in_c.
 * Example     : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
181  *        in_c : 1,2,3,4
182  *        in_h : 1,2,3,4, 5,6,7,8
183  *        in_l : 8,7,6,5, 4,3,2,1
184  *         out : 23,40,41,26
185  * =============================================================================
186  */
static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  /* Accumulate the widened even-lane half-word products onto in_c ... */
  const __m128i even_acc = __lsx_vmaddwev_w_h(in_c, in_h, in_l);

  /* ... then add the widened odd-lane products into the same word lanes. */
  return __lsx_vmaddwod_w_h(even_acc, in_h, in_l);
}
195 
196 /*
197  * =============================================================================
198  * Description : Dot product of byte vector elements
199  * Arguments   : Inputs  - in_h, in_l
200  *               Outputs - out
201  *               Return Type - halfword
202  * Details     : Signed byte elements from in_h are multiplied by
203  *               signed byte elements from in_l, and then added adjacent to
204  *               each other to get results with the twice size of input.
205  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
206  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
207  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
208  *         out : 22,38,38,22, 22,38,38,22
209  * =============================================================================
210  */
static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
  /* Start from the widened even-lane byte products ... */
  const __m128i even_prod = __lsx_vmulwev_h_b(in_h, in_l);

  /* ... and add the widened odd-lane products to form each pair sum. */
  return __lsx_vmaddwod_h_b(even_prod, in_h, in_l);
}
218 
219 /*
220  * =============================================================================
221  * Description : Dot product of byte vector elements
222  * Arguments   : Inputs  - in_h, in_l
223  *               Outputs - out
224  *               Return Type - halfword
225  * Details     : Unsigned byte elements from in_h are multiplied by
226  *               unsigned byte elements from in_l, and then added adjacent to
227  *               each other to get results with the twice size of input.
228  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
229  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
230  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
231  *         out : 22,38,38,22, 22,38,38,22
232  * =============================================================================
233  */
static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
  /* Unsigned variant: widened even-lane products form the starting value. */
  const __m128i even_prod = __lsx_vmulwev_h_bu(in_h, in_l);

  /* The odd-lane products are added to give the per-pair dot product. */
  return __lsx_vmaddwod_h_bu(even_prod, in_h, in_l);
}
241 
242 /*
243  * =============================================================================
244  * Description : Dot product of byte vector elements
245  * Arguments   : Inputs  - in_h, in_l
246  *               Outputs - out
247  *               Return Type - halfword
248  * Details     : Unsigned byte elements from in_h are multiplied by
249  *               signed byte elements from in_l, and then added adjacent to
250  *               each other to get results with the twice size of input.
251  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
252  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
253  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
254  *         out : 22,38,38,22, 22,38,38,6
255  * =============================================================================
256  */
static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
  /* Mixed-sign variant (unsigned in_h, signed in_l): even lanes first. */
  const __m128i even_prod = __lsx_vmulwev_h_bu_b(in_h, in_l);

  /* Odd-lane products are then added to the even-lane products. */
  return __lsx_vmaddwod_h_bu_b(even_prod, in_h, in_l);
}
264 
265 /*
266  * =============================================================================
 * Description : Dot product of half-word vector elements
 * Arguments   : Inputs  - in_h, in_l
 *               Outputs - out
 *               Return Type - word
 * Details     : Signed half-word elements from in_h are multiplied by
 *               signed half-word elements from in_l, and adjacent products are
 *               added together to give results twice the width of the inputs.
274  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
275  *        in_h : 1,2,3,4, 5,6,7,8
276  *        in_l : 8,7,6,5, 4,3,2,1
277  *         out : 22,38,38,22
278  * =============================================================================
279  */
static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
  /* Start from the widened even-lane half-word products ... */
  const __m128i even_prod = __lsx_vmulwev_w_h(in_h, in_l);

  /* ... and add the widened odd-lane products into the same word lanes. */
  return __lsx_vmaddwod_w_h(even_prod, in_h, in_l);
}
287 
288 /*
289  * =============================================================================
290  * Description : Clip all halfword elements of input vector between min & max
291  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
292  *               (_in))
293  * Arguments   : Inputs  - _in  (input vector)
294  *                       - min  (min threshold)
295  *                       - max  (max threshold)
296  *               Outputs - out  (output vector with clipped elements)
297  *               Return Type - signed halfword
 * Example     : out = __lsx_vclip_h(_in, min, max)
299  *         _in : -8,2,280,249, -8,255,280,249
300  *         min : 1,1,1,1, 1,1,1,1
301  *         max : 9,9,9,9, 9,9,9,9
302  *         out : 1,2,9,9, 1,9,9,9
303  * =============================================================================
304  */
static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
  /* Lower bound first: each lane becomes the larger of min and _in. */
  const __m128i floored = __lsx_vmax_h(min, _in);

  /* Upper bound second: each lane becomes the smaller of max and that. */
  return __lsx_vmin_h(max, floored);
}
312 
313 /*
314  * =============================================================================
315  * Description : Set each element of vector between 0 and 255
316  * Arguments   : Inputs  - _in
317  *               Outputs - out
318  *               Return Type - halfword
 * Details     : Signed halfword elements from _in are clamped between 0 and 255.
320  * Example     : out = __lsx_vclip255_h(_in)
321  *         _in : -8,255,280,249, -8,255,280,249
322  *         out : 0,255,255,249, 0,255,255,249
323  * =============================================================================
324  */
static inline __m128i __lsx_vclip255_h(__m128i _in) {
  /* Negative half-word lanes are raised to 0. */
  const __m128i non_negative = __lsx_vmaxi_h(_in, 0);

  /* Unsigned saturation with immediate 7 caps each lane at 255. */
  return __lsx_vsat_hu(non_negative, 7);
}
332 
333 /*
334  * =============================================================================
335  * Description : Set each element of vector between 0 and 255
336  * Arguments   : Inputs  - _in
337  *               Outputs - out
338  *               Return Type - word
 * Details     : Signed word elements from _in are clamped between 0 and 255.
340  * Example     : out = __lsx_vclip255_w(_in)
341  *         _in : -8,255,280,249
342  *         out : 0,255,255,249
343  * =============================================================================
344  */
static inline __m128i __lsx_vclip255_w(__m128i _in) {
  /* Negative word lanes are raised to 0. */
  const __m128i non_negative = __lsx_vmaxi_w(_in, 0);

  /* Unsigned saturation with immediate 7 caps each lane at 255. */
  return __lsx_vsat_wu(non_negative, 7);
}
352 
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Exchanges the two vectors through a temporary. Unlike an
 *               xor-based swap, this stays correct when both arguments
 *               designate the same object (an xor swap would clear it to 0).
 *               Each argument is evaluated more than once, so pass plain
 *               lvalues without side effects.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)        \
  {                                 \
    __m128i _lsx_swap_tmp = (_in0); \
    (_in0) = (_in1);                \
    (_in1) = _lsx_swap_tmp;         \
  }
372 
373 /*
374  * =============================================================================
375  * Description : Transpose 4x4 block with word elements in vectors
376  * Arguments   : Inputs  - in0, in1, in2, in3
377  *               Outputs - out0, out1, out2, out3
378  * Details     :
379  * Example     :
380  *               1, 2, 3, 4            1, 5, 9,13
381  *               5, 6, 7, 8    to      2, 6,10,14
382  *               9,10,11,12  =====>    3, 7,11,15
383  *              13,14,15,16            4, 8,12,16
384  * =============================================================================
385  */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    /* Interleave the words of rows 0/1 and of rows 2/3. Every input is */     \
    /* read here, before any output is written. */                             \
    __m128i _w4_r01_lo = __lsx_vilvl_w(_in1, _in0);                            \
    __m128i _w4_r01_hi = __lsx_vilvh_w(_in1, _in0);                            \
    __m128i _w4_r23_lo = __lsx_vilvl_w(_in3, _in2);                            \
    __m128i _w4_r23_hi = __lsx_vilvh_w(_in3, _in2);                            \
                                                                               \
    /* Join matching 64-bit halves to build the four output vectors. */        \
    _out0 = __lsx_vilvl_d(_w4_r23_lo, _w4_r01_lo);                             \
    _out1 = __lsx_vilvh_d(_w4_r23_lo, _w4_r01_lo);                             \
    _out2 = __lsx_vilvl_d(_w4_r23_hi, _w4_r01_hi);                             \
    _out3 = __lsx_vilvh_d(_w4_r23_hi, _w4_r01_hi);                             \
  }
399 
400 /*
401  * =============================================================================
402  * Description : Transpose 8x8 block with byte elements in vectors
403  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
404  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
405  *               _out7
406  * Details     : The rows of the matrix become columns, and the columns
407  *               become rows.
408  * Example     : LSX_TRANSPOSE8x8_B
409  *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
410  *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
411  *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
412  *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
413  *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
414  *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
415  *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
416  *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
417  *
418  *      _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
419  *      _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
420  *      _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
421  *      _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
422  *      _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
423  *      _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
424  *      _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
425  *      _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
426  * =============================================================================
427  */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    const __m128i _b8_zero = { 0 };                                         \
    /* Byte selector for vshuf_b: indices 8..15 in the low half and */      \
    /* indices 16..23 in the high half. */                                  \
    const __m128i _b8_hi_sel = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };  \
                                                                            \
    /* Stage 1: byte-interleave the low halves of row pairs; all inputs */  \
    /* are consumed here, before any output is written. */                  \
    __m128i _b8_r02 = __lsx_vilvl_b(_in2, _in0);                            \
    __m128i _b8_r13 = __lsx_vilvl_b(_in3, _in1);                            \
    __m128i _b8_r46 = __lsx_vilvl_b(_in6, _in4);                            \
    __m128i _b8_r57 = __lsx_vilvl_b(_in7, _in5);                            \
                                                                            \
    /* Stage 2: interleave again to group rows 0-3 and rows 4-7. */         \
    __m128i _b8_top_lo = __lsx_vilvl_b(_b8_r13, _b8_r02);                   \
    __m128i _b8_top_hi = __lsx_vilvh_b(_b8_r13, _b8_r02);                   \
    __m128i _b8_bot_lo = __lsx_vilvl_b(_b8_r57, _b8_r46);                   \
    __m128i _b8_bot_hi = __lsx_vilvh_b(_b8_r57, _b8_r46);                   \
                                                                            \
    /* Stage 3: word-interleave the two row groups (even outputs). */       \
    _out0 = __lsx_vilvl_w(_b8_bot_lo, _b8_top_lo);                          \
    _out2 = __lsx_vilvh_w(_b8_bot_lo, _b8_top_lo);                          \
    _out4 = __lsx_vilvl_w(_b8_bot_hi, _b8_top_hi);                          \
    _out6 = __lsx_vilvh_w(_b8_bot_hi, _b8_top_hi);                          \
                                                                            \
    /* Odd outputs are derived from the preceding even output via the */    \
    /* byte selector, with the zero vector as the other shuffle source. */  \
    _out1 = __lsx_vshuf_b(_b8_zero, _out0, _b8_hi_sel);                     \
    _out3 = __lsx_vshuf_b(_b8_zero, _out2, _b8_hi_sel);                     \
    _out5 = __lsx_vshuf_b(_b8_zero, _out4, _b8_hi_sel);                     \
    _out7 = __lsx_vshuf_b(_b8_zero, _out6, _b8_hi_sel);                     \
  }
453 
454 /*
455  * =============================================================================
456  * Description : Transpose 8x8 block with half-word elements in vectors
457  * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
458  *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
459  * Details     :
460  * Example     :
461  *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
462  *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
463  *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
464  *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
465  *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
466  *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
467  *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
468  *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
469  * =============================================================================
470  */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Stage 1: half-word interleave of row pairs (low and high halves). */ \
    /* All inputs are consumed here, before any output is written. */       \
    __m128i _h8_r46_lo = __lsx_vilvl_h(_in6, _in4);                         \
    __m128i _h8_r57_lo = __lsx_vilvl_h(_in7, _in5);                         \
    __m128i _h8_r46_hi = __lsx_vilvh_h(_in6, _in4);                         \
    __m128i _h8_r57_hi = __lsx_vilvh_h(_in7, _in5);                         \
    __m128i _h8_r02_lo = __lsx_vilvl_h(_in2, _in0);                         \
    __m128i _h8_r13_lo = __lsx_vilvl_h(_in3, _in1);                         \
    __m128i _h8_r02_hi = __lsx_vilvh_h(_in2, _in0);                         \
    __m128i _h8_r13_hi = __lsx_vilvh_h(_in3, _in1);                         \
                                                                            \
    /* Stage 2: interleave again, giving rows 4-7 ("bot") and rows 0-3 */   \
    /* ("top") grouped per pair of output vectors. */                       \
    __m128i _h8_bot_c01 = __lsx_vilvl_h(_h8_r57_lo, _h8_r46_lo);            \
    __m128i _h8_bot_c23 = __lsx_vilvh_h(_h8_r57_lo, _h8_r46_lo);            \
    __m128i _h8_bot_c45 = __lsx_vilvl_h(_h8_r57_hi, _h8_r46_hi);            \
    __m128i _h8_bot_c67 = __lsx_vilvh_h(_h8_r57_hi, _h8_r46_hi);            \
    __m128i _h8_top_c01 = __lsx_vilvl_h(_h8_r13_lo, _h8_r02_lo);            \
    __m128i _h8_top_c23 = __lsx_vilvh_h(_h8_r13_lo, _h8_r02_lo);            \
    __m128i _h8_top_c45 = __lsx_vilvl_h(_h8_r13_hi, _h8_r02_hi);            \
    __m128i _h8_top_c67 = __lsx_vilvh_h(_h8_r13_hi, _h8_r02_hi);            \
                                                                            \
    /* Stage 3: even outputs take the even 64-bit halves, odd outputs */    \
    /* take the odd 64-bit halves of each top/bottom pair. */               \
    _out0 = __lsx_vpickev_d(_h8_bot_c01, _h8_top_c01);                      \
    _out2 = __lsx_vpickev_d(_h8_bot_c23, _h8_top_c23);                      \
    _out4 = __lsx_vpickev_d(_h8_bot_c45, _h8_top_c45);                      \
    _out6 = __lsx_vpickev_d(_h8_bot_c67, _h8_top_c67);                      \
    _out1 = __lsx_vpickod_d(_h8_bot_c01, _h8_top_c01);                      \
    _out3 = __lsx_vpickod_d(_h8_bot_c23, _h8_top_c23);                      \
    _out5 = __lsx_vpickod_d(_h8_bot_c45, _h8_top_c45);                      \
    _out7 = __lsx_vpickod_d(_h8_bot_c67, _h8_top_c67);                      \
  }
503 
504 /*
505  * =============================================================================
506  * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block)
 *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
509  *               Return Type - as per RTYPE
510  * Details     : The rows of the matrix become columns, and the columns become
511  *               rows.
512  * Example     : LSX_TRANSPOSE8x4_B
513  *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
514  *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
515  *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
516  *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
517  *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
518  *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
519  *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
520  *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
521  *
522  *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
523  *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
524  *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
525  *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
526  * =============================================================================
527  */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
  {                                                                        \
    /* Pack the first word of rows n and n+4 into one vector, then */      \
    /* byte-interleave neighbouring packs. All inputs are read here, */    \
    /* before any output is written. */                                    \
    __m128i _b84_r04 = __lsx_vpackev_w(_in4, _in0);                        \
    __m128i _b84_r15 = __lsx_vpackev_w(_in5, _in1);                        \
    __m128i _b84_r0145 = __lsx_vilvl_b(_b84_r15, _b84_r04);                \
    __m128i _b84_r26 = __lsx_vpackev_w(_in6, _in2);                        \
    __m128i _b84_r37 = __lsx_vpackev_w(_in7, _in3);                        \
    __m128i _b84_r2367 = __lsx_vilvl_b(_b84_r37, _b84_r26);                \
                                                                           \
    /* Half-word interleave merges the two byte-interleaved groups. */     \
    __m128i _b84_lo = __lsx_vilvl_h(_b84_r2367, _b84_r0145);               \
    __m128i _b84_hi = __lsx_vilvh_h(_b84_r2367, _b84_r0145);               \
                                                                           \
    /* _out1 and _out3 are built from the high 64-bit halves of _out0 */   \
    /* and _out2, so those two must be assigned first. */                  \
    _out0 = __lsx_vilvl_w(_b84_hi, _b84_lo);                               \
    _out2 = __lsx_vilvh_w(_b84_hi, _b84_lo);                               \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
  }
548 
549 /*
550  * =============================================================================
551  * Description : Transpose 16x8 block with byte elements in vectors
552  * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8
553  *                         in9, in10, in11, in12, in13, in14, in15
554  *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
555  * Details     :
556  * Example     :
557  *              000,001,002,003,004,005,006,007
558  *              008,009,010,011,012,013,014,015
559  *              016,017,018,019,020,021,022,023
560  *              024,025,026,027,028,029,030,031
561  *              032,033,034,035,036,037,038,039
562  *              040,041,042,043,044,045,046,047        000,008,...,112,120
563  *              048,049,050,051,052,053,054,055        001,009,...,113,121
564  *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
566  *              072,073,074,075,076,077,078,079        004,012,...,116,124
567  *              080,081,082,083,084,085,086,087        005,013,...,117,125
568  *              088,089,090,091,092,093,094,095        006,014,...,118,126
569  *              096,097,098,099,100,101,102,103        007,015,...,119,127
570  *              104,105,106,107,108,109,110,111
571  *              112,113,114,115,116,117,118,119
572  *              120,121,122,123,124,125,126,127
573  * =============================================================================
574  */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  {                                                                          \
    /* Stage 1: byte-interleave the low halves of row pairs. All sixteen */  \
    /* inputs are consumed here, before any output is written. */            \
    __m128i _b16_p02 = __lsx_vilvl_b(_in2, _in0);                            \
    __m128i _b16_p13 = __lsx_vilvl_b(_in3, _in1);                            \
    __m128i _b16_p46 = __lsx_vilvl_b(_in6, _in4);                            \
    __m128i _b16_p57 = __lsx_vilvl_b(_in7, _in5);                            \
    __m128i _b16_p8a = __lsx_vilvl_b(_in10, _in8);                           \
    __m128i _b16_p9b = __lsx_vilvl_b(_in11, _in9);                           \
    __m128i _b16_pce = __lsx_vilvl_b(_in14, _in12);                          \
    __m128i _b16_pdf = __lsx_vilvl_b(_in15, _in13);                          \
                                                                             \
    /* Stage 2: interleave again to form groups of four rows */              \
    /* (q0: rows 0-3, q1: rows 4-7, q2: rows 8-11, q3: rows 12-15). */       \
    __m128i _b16_q0_lo = __lsx_vilvl_b(_b16_p13, _b16_p02);                  \
    __m128i _b16_q1_lo = __lsx_vilvl_b(_b16_p57, _b16_p46);                  \
    __m128i _b16_q0_hi = __lsx_vilvh_b(_b16_p13, _b16_p02);                  \
    __m128i _b16_q1_hi = __lsx_vilvh_b(_b16_p57, _b16_p46);                  \
    __m128i _b16_q2_lo = __lsx_vilvl_b(_b16_p9b, _b16_p8a);                  \
    __m128i _b16_q3_lo = __lsx_vilvl_b(_b16_pdf, _b16_pce);                  \
    __m128i _b16_q2_hi = __lsx_vilvh_b(_b16_p9b, _b16_p8a);                  \
    __m128i _b16_q3_hi = __lsx_vilvh_b(_b16_pdf, _b16_pce);                  \
                                                                             \
    /* Stage 3: word-interleave into groups of eight rows */                 \
    /* ("top": rows 0-7, "bot": rows 8-15), two outputs per vector. */       \
    __m128i _b16_top_c01 = __lsx_vilvl_w(_b16_q1_lo, _b16_q0_lo);            \
    __m128i _b16_top_c45 = __lsx_vilvl_w(_b16_q1_hi, _b16_q0_hi);            \
    __m128i _b16_top_c23 = __lsx_vilvh_w(_b16_q1_lo, _b16_q0_lo);            \
    __m128i _b16_top_c67 = __lsx_vilvh_w(_b16_q1_hi, _b16_q0_hi);            \
    __m128i _b16_bot_c01 = __lsx_vilvl_w(_b16_q3_lo, _b16_q2_lo);            \
    __m128i _b16_bot_c45 = __lsx_vilvl_w(_b16_q3_hi, _b16_q2_hi);            \
    __m128i _b16_bot_c23 = __lsx_vilvh_w(_b16_q3_lo, _b16_q2_lo);            \
    __m128i _b16_bot_c67 = __lsx_vilvh_w(_b16_q3_hi, _b16_q2_hi);            \
                                                                             \
    /* Stage 4: join the 64-bit halves of top and bottom into full rows. */  \
    _out0 = __lsx_vilvl_d(_b16_bot_c01, _b16_top_c01);                       \
    _out2 = __lsx_vilvl_d(_b16_bot_c23, _b16_top_c23);                       \
    _out1 = __lsx_vilvh_d(_b16_bot_c01, _b16_top_c01);                       \
    _out3 = __lsx_vilvh_d(_b16_bot_c23, _b16_top_c23);                       \
    _out4 = __lsx_vilvl_d(_b16_bot_c45, _b16_top_c45);                       \
    _out6 = __lsx_vilvl_d(_b16_bot_c67, _b16_top_c67);                       \
    _out5 = __lsx_vilvh_d(_b16_bot_c45, _b16_top_c45);                       \
    _out7 = __lsx_vilvh_d(_b16_bot_c67, _b16_top_c67);                       \
  }
599 
/*
 * =============================================================================
 * Description : 4-input butterfly
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Pairs the outer inputs (_in0, _in3) and the inner inputs
 *               (_in1, _in2). The first two outputs receive the sums and the
 *               last two receive the differences. The _B, _H, _W and _D
 *               variants apply the same pattern with the byte, halfword, word
 *               and doubleword add/sub intrinsics respectively. Outputs are
 *               assigned in the order _out0, _out1, _out2, _out3.
 * Example     :
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    /* _out0 = _in0 + _in3, _out1 = _in1 + _in2 */                            \
    DUP2_ARG2(__lsx_vadd_b, _in0, _in3, _in1, _in2, _out0, _out1);            \
    /* _out2 = _in1 - _in2, _out3 = _in0 - _in3 */                            \
    DUP2_ARG2(__lsx_vsub_b, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* Halfword-element variant of the 4-input butterfly. */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    /* _out0 = _in0 + _in3, _out1 = _in1 + _in2 */                            \
    DUP2_ARG2(__lsx_vadd_h, _in0, _in3, _in1, _in2, _out0, _out1);            \
    /* _out2 = _in1 - _in2, _out3 = _in0 - _in3 */                            \
    DUP2_ARG2(__lsx_vsub_h, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* Word-element variant of the 4-input butterfly. */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    /* _out0 = _in0 + _in3, _out1 = _in1 + _in2 */                            \
    DUP2_ARG2(__lsx_vadd_w, _in0, _in3, _in1, _in2, _out0, _out1);            \
    /* _out2 = _in1 - _in2, _out3 = _in0 - _in3 */                            \
    DUP2_ARG2(__lsx_vsub_w, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* Doubleword-element variant of the 4-input butterfly. */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    /* _out0 = _in0 + _in3, _out1 = _in1 + _in2 */                            \
    DUP2_ARG2(__lsx_vadd_d, _in0, _in3, _in1, _in2, _out0, _out1);            \
    /* _out2 = _in1 - _in2, _out3 = _in0 - _in3 */                            \
    DUP2_ARG2(__lsx_vsub_d, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
641 
/*
 * =============================================================================
 * Description : 8-input butterfly
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Input k is paired with input (7 - k). The first four outputs
 *               receive the sums, from the outermost pair inwards; the last
 *               four receive the differences, from the innermost pair
 *               outwards. The _B, _H, _W and _D variants apply the same
 *               pattern with the byte, halfword, word and doubleword add/sub
 *               intrinsics respectively. Outputs are assigned in the order
 *               _out0 .. _out7.
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    /* Sums: _out0.._out3 = _in0+_in7, _in1+_in6, _in2+_in5, _in3+_in4 */  \
    DUP4_ARG2(__lsx_vadd_b, _in0, _in7, _in1, _in6, _in2, _in5,            \
              _in3, _in4, _out0, _out1, _out2, _out3);                     \
    /* Diffs: _out4.._out7 = _in3-_in4, _in2-_in5, _in1-_in6, _in0-_in7 */ \
    DUP4_ARG2(__lsx_vsub_b, _in3, _in4, _in2, _in5, _in1, _in6,            \
              _in0, _in7, _out4, _out5, _out6, _out7);                     \
  }
672 
/* Halfword-element variant of the 8-input butterfly. */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    /* Sums: _out0.._out3 = _in0+_in7, _in1+_in6, _in2+_in5, _in3+_in4 */  \
    DUP4_ARG2(__lsx_vadd_h, _in0, _in7, _in1, _in6, _in2, _in5,            \
              _in3, _in4, _out0, _out1, _out2, _out3);                     \
    /* Diffs: _out4.._out7 = _in3-_in4, _in2-_in5, _in1-_in6, _in0-_in7 */ \
    DUP4_ARG2(__lsx_vsub_h, _in3, _in4, _in2, _in5, _in1, _in6,            \
              _in0, _in7, _out4, _out5, _out6, _out7);                     \
  }
686 
/* Word-element variant of the 8-input butterfly. */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    /* Sums: _out0.._out3 = _in0+_in7, _in1+_in6, _in2+_in5, _in3+_in4 */  \
    DUP4_ARG2(__lsx_vadd_w, _in0, _in7, _in1, _in6, _in2, _in5,            \
              _in3, _in4, _out0, _out1, _out2, _out3);                     \
    /* Diffs: _out4.._out7 = _in3-_in4, _in2-_in5, _in1-_in6, _in0-_in7 */ \
    DUP4_ARG2(__lsx_vsub_w, _in3, _in4, _in2, _in5, _in1, _in6,            \
              _in0, _in7, _out4, _out5, _out6, _out7);                     \
  }
700 
/* Doubleword-element variant of the 8-input butterfly. */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    /* Sums: _out0.._out3 = _in0+_in7, _in1+_in6, _in2+_in5, _in3+_in4 */  \
    DUP4_ARG2(__lsx_vadd_d, _in0, _in7, _in1, _in6, _in2, _in5,            \
              _in3, _in4, _out0, _out1, _out2, _out3);                     \
    /* Diffs: _out4.._out7 = _in3-_in4, _in2-_in5, _in1-_in6, _in0-_in7 */ \
    DUP4_ARG2(__lsx_vsub_d, _in3, _in4, _in2, _in5, _in1, _in6,            \
              _in0, _in7, _out4, _out5, _out6, _out7);                     \
  }
714 
715 #endif  // LSX
716 
717 #ifdef __loongarch_asx
718 #include <lasxintrin.h>
719 /*
720  * =============================================================================
721  * Description : Dot product of byte vector elements
722  * Arguments   : Inputs - in_h, in_l
723  *               Output - out
724  *               Return Type - signed halfword
725  * Details     : Unsigned byte elements from in_h are multiplied with
726  *               unsigned byte elements from in_l producing a result
727  *               twice the size of input i.e. signed halfword.
728  *               Then this multiplied results of adjacent odd-even elements
729  *               are added to the out vector
730  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
731  * =============================================================================
732  */
__lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)733 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
734   __m256i out;
735 
736   out = __lasx_xvmulwev_h_bu(in_h, in_l);
737   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
738   return out;
739 }
740 
741 /*
742  * =============================================================================
743  * Description : Dot product of byte vector elements
744  * Arguments   : Inputs - in_h, in_l
745  *               Output - out
746  *               Return Type - signed halfword
747  * Details     : Signed byte elements from in_h are multiplied with
748  *               signed byte elements from in_l producing a result
749  *               twice the size of input i.e. signed halfword.
750  *               Then this multiplication results of adjacent odd-even elements
751  *               are added to the out vector
752  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
753  * =============================================================================
754  */
__lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)755 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
756   __m256i out;
757 
758   out = __lasx_xvmulwev_h_b(in_h, in_l);
759   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
760   return out;
761 }
762 
763 /*
764  * =============================================================================
765  * Description : Dot product of halfword vector elements
766  * Arguments   : Inputs - in_h, in_l
767  *               Output - out
768  *               Return Type - signed word
769  * Details     : Signed halfword elements from in_h are multiplied with
770  *               signed halfword elements from in_l producing a result
771  *               twice the size of input i.e. signed word.
772  *               Then this multiplied results of adjacent odd-even elements
773  *               are added to the out vector.
774  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
775  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
776  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
777  *         out : 22,38,38,22, 22,38,38,22
778  * =============================================================================
779  */
__lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)780 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
781   __m256i out;
782 
783   out = __lasx_xvmulwev_w_h(in_h, in_l);
784   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
785   return out;
786 }
787 
788 /*
789  * =============================================================================
790  * Description : Dot product of word vector elements
791  * Arguments   : Inputs - in_h, in_l
792  *               Output - out
793  *               Return Type - signed double
794  * Details     : Signed word elements from in_h are multiplied with
795  *               signed word elements from in_l producing a result
796  *               twice the size of input i.e. signed double-word.
797  *               Then this multiplied results of adjacent odd-even elements
798  *               are added to the out vector.
799  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
800  * =============================================================================
801  */
__lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)802 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
803   __m256i out;
804 
805   out = __lasx_xvmulwev_d_w(in_h, in_l);
806   out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
807   return out;
808 }
809 
810 /*
811  * =============================================================================
812  * Description : Dot product of halfword vector elements
813  * Arguments   : Inputs - in_h, in_l
814  *               Output - out
815  *               Return Type - signed word
816  * Details     : Unsigned halfword elements from in_h are multiplied with
817  *               signed halfword elements from in_l producing a result
818  *               twice the size of input i.e. unsigned word.
819  *               Multiplication result of adjacent odd-even elements
820  *               are added to the out vector
821  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
822  * =============================================================================
823  */
__lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)824 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
825   __m256i out;
826 
827   out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
828   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
829   return out;
830 }
831 
832 /*
833  * =============================================================================
834  * Description : Dot product & addition of byte vector elements
835  * Arguments   : Inputs - in_h, in_l
836  *               Output - out
837  *               Return Type - halfword
838  * Details     : Signed byte elements from in_h are multiplied with
839  *               signed byte elements from in_l producing a result
840  *               twice the size of input i.e. signed halfword.
841  *               Then this multiplied results of adjacent odd-even elements
842  *               are added to the in_c vector.
843  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
844  * =============================================================================
845  */
__lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, __m256i in_l)846 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
847                                           __m256i in_l) {
848   __m256i out;
849 
850   out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
851   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
852   return out;
853 }
854 
855 /*
856  * =============================================================================
857  * Description : Dot product & addition of byte vector elements
858  * Arguments   : Inputs - in_h, in_l
859  *               Output - out
860  *               Return Type - halfword
861  * Details     : Unsigned byte elements from in_h are multiplied with
862  *               unsigned byte elements from in_l producing a result
863  *               twice the size of input i.e. signed halfword.
864  *               Then this multiplied results of adjacent odd-even elements
865  *               are added to the in_c vector.
866  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
867  * =============================================================================
868  */
__lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)869 static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
870                                            __m256i in_l) {
871   __m256i out;
872 
873   out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
874   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
875   return out;
876 }
877 
878 /*
879  * =============================================================================
880  * Description : Dot product & addition of byte vector elements
881  * Arguments   : Inputs - in_h, in_l
882  *               Output - out
883  *               Return Type - halfword
884  * Details     : Unsigned byte elements from in_h are multiplied with
885  *               signed byte elements from in_l producing a result
886  *               twice the size of input i.e. signed halfword.
887  *               Then this multiplied results of adjacent odd-even elements
888  *               are added to the in_c vector.
889  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
890  * =============================================================================
891  */
__lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, __m256i in_l)892 static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
893                                              __m256i in_l) {
894   __m256i out;
895 
896   out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
897   out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
898   return out;
899 }
900 
901 /*
902  * =============================================================================
903  * Description : Dot product of halfword vector elements
904  * Arguments   : Inputs - in_c, in_h, in_l
905  *               Output - out
906  *               Return Type - per RTYPE
907  * Details     : Signed halfword elements from in_h are multiplied with
908  *               signed halfword elements from in_l producing a result
909  *               twice the size of input i.e. signed word.
910  *               Multiplication result of adjacent odd-even elements
911  *               are added to the in_c vector.
912  * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
913  *        in_c : 1,2,3,4, 1,2,3,4
914  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
915  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
916  *         out : 23,40,41,26, 23,40,41,26
917  * =============================================================================
918  */
__lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)919 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
920                                           __m256i in_l) {
921   __m256i out;
922 
923   out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
924   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
925   return out;
926 }
927 
928 /*
929  * =============================================================================
930  * Description : Dot product of halfword vector elements
931  * Arguments   : Inputs - in_c, in_h, in_l
932  *               Output - out
933  *               Return Type - signed word
934  * Details     : Unsigned halfword elements from in_h are multiplied with
935  *               unsigned halfword elements from in_l producing a result
936  *               twice the size of input i.e. signed word.
937  *               Multiplication result of adjacent odd-even elements
938  *               are added to the in_c vector.
939  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
940  * =============================================================================
941  */
__lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)942 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
943                                            __m256i in_l) {
944   __m256i out;
945 
946   out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
947   out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
948   return out;
949 }
950 
951 /*
952  * =============================================================================
953  * Description : Dot product of halfword vector elements
954  * Arguments   : Inputs - in_c, in_h, in_l
955  *               Output - out
956  *               Return Type - signed word
957  * Details     : Unsigned halfword elements from in_h are multiplied with
958  *               signed halfword elements from in_l producing a result
959  *               twice the size of input i.e. signed word.
960  *               Multiplication result of adjacent odd-even elements
961  *               are added to the in_c vector
962  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
963  * =============================================================================
964  */
__lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)965 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
966                                              __m256i in_l) {
967   __m256i out;
968 
969   out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
970   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971   return out;
972 }
973 
974 /*
975  * =============================================================================
976  * Description : Vector Unsigned Dot Product and Subtract
977  * Arguments   : Inputs - in_c, in_h, in_l
978  *               Output - out
979  *               Return Type - signed halfword
980  * Details     : Unsigned byte elements from in_h are multiplied with
981  *               unsigned byte elements from in_l producing a result
982  *               twice the size of input i.e. signed halfword.
983  *               Multiplication result of adjacent odd-even elements
984  *               are added together and subtracted from double width elements
985  *               in_c vector.
986  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
987  * =============================================================================
988  */
__lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)989 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
990                                            __m256i in_l) {
991   __m256i out;
992 
993   out = __lasx_xvmulwev_h_bu(in_h, in_l);
994   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
995   out = __lasx_xvsub_h(in_c, out);
996   return out;
997 }
998 
999 /*
1000  * =============================================================================
1001  * Description : Vector Signed Dot Product and Subtract
1002  * Arguments   : Inputs - in_c, in_h, in_l
1003  *               Output - out
1004  *               Return Type - signed word
1005  * Details     : Signed halfword elements from in_h are multiplied with
1006  *               Signed halfword elements from in_l producing a result
1007  *               twice the size of input i.e. signed word.
1008  *               Multiplication result of adjacent odd-even elements
1009  *               are added together and subtracted from double width elements
1010  *               in_c vector.
1011  * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1012  *        in_c : 0,0,0,0, 0,0,0,0
1013  *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1014  *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1015  *         out : -7,-3,0,0, 0,-1,0,-1
1016  * =============================================================================
1017  */
__lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)1018 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1019                                           __m256i in_l) {
1020   __m256i out;
1021 
1022   out = __lasx_xvmulwev_w_h(in_h, in_l);
1023   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1024   out = __lasx_xvsub_w(in_c, out);
1025   return out;
1026 }
1027 
1028 /*
1029  * =============================================================================
1030  * Description : Dot product of halfword vector elements
1031  * Arguments   : Inputs - in_h, in_l
1032  *               Output - out
1033  *               Return Type - signed word
1034  * Details     : Signed halfword elements from in_h are multiplied with
1035  *               signed halfword elements from in_l producing a result
1036  *               four times the size of input i.e. signed doubleword.
1037  *               Then this multiplication results of four adjacent elements
1038  *               are added together and stored to the out vector.
1039  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
1040  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1041  *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1042  *         out : -2,0,1,1
1043  * =============================================================================
1044  */
__lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)1045 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1046   __m256i out;
1047 
1048   out = __lasx_xvmulwev_w_h(in_h, in_l);
1049   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1050   out = __lasx_xvhaddw_d_w(out, out);
1051   return out;
1052 }
1053 
1054 /*
1055  * =============================================================================
1056  * Description : The high half of the vector elements are expanded and
1057  *               added after being doubled.
1058  * Arguments   : Inputs - in_h, in_l
1059  *               Output - out
1060  * Details     : The in_h vector and the in_l vector are added after the
1061  *               higher half of the two-fold sign extension (signed byte
1062  *               to signed halfword) and stored to the out vector.
1063  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1064  * =============================================================================
1065  */
__lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)1066 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1067   __m256i out;
1068 
1069   out = __lasx_xvilvh_b(in_h, in_l);
1070   out = __lasx_xvhaddw_h_b(out, out);
1071   return out;
1072 }
1073 
1074 /*
1075  * =============================================================================
1076  * Description : The high half of the vector elements are expanded and
1077  *               added after being doubled.
1078  * Arguments   : Inputs - in_h, in_l
1079  *               Output - out
1080  * Details     : The in_h vector and the in_l vector are added after the
1081  *               higher half of the two-fold sign extension (signed halfword
1082  *               to signed word) and stored to the out vector.
1083  * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
1084  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1085  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1086  *         out : 1,0,0,-1, 1,0,0, 2
1087  * =============================================================================
1088  */
__lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)1089 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1090   __m256i out;
1091 
1092   out = __lasx_xvilvh_h(in_h, in_l);
1093   out = __lasx_xvhaddw_w_h(out, out);
1094   return out;
1095 }
1096 
1097 /*
1098  * =============================================================================
1099  * Description : The low half of the vector elements are expanded and
1100  *               added after being doubled.
1101  * Arguments   : Inputs - in_h, in_l
1102  *               Output - out
1103  * Details     : The in_h vector and the in_l vector are added after the
1104  *               lower half of the two-fold sign extension (signed byte
1105  *               to signed halfword) and stored to the out vector.
1106  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1107  * =============================================================================
1108  */
__lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)1109 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1110   __m256i out;
1111 
1112   out = __lasx_xvilvl_b(in_h, in_l);
1113   out = __lasx_xvhaddw_h_b(out, out);
1114   return out;
1115 }
1116 
1117 /*
1118  * =============================================================================
1119  * Description : The low half of the vector elements are expanded and
1120  *               added after being doubled.
1121  * Arguments   : Inputs - in_h, in_l
1122  *               Output - out
1123  * Details     : The in_h vector and the in_l vector are added after the
1124  *               lower half of the two-fold sign extension (signed halfword
1125  *               to signed word) and stored to the out vector.
1126  * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1127  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1128  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1129  *         out : 5,-1,4,2, 1,0,2,-1
1130  * =============================================================================
1131  */
__lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)1132 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1133   __m256i out;
1134 
1135   out = __lasx_xvilvl_h(in_h, in_l);
1136   out = __lasx_xvhaddw_w_h(out, out);
1137   return out;
1138 }
1139 
1140 /*
1141  * =============================================================================
1142  * Description : The low half of the vector elements are expanded and
1143  *               added after being doubled.
1144  * Arguments   : Inputs - in_h, in_l
1145  *               Output - out
1146  * Details     : The out vector and the out vector are added after the
1147  *               lower half of the two-fold zero extension (unsigned byte
1148  *               to unsigned halfword) and stored to the out vector.
1149  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1150  * =============================================================================
1151  */
__lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)1152 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1153   __m256i out;
1154 
1155   out = __lasx_xvilvl_b(in_h, in_l);
1156   out = __lasx_xvhaddw_hu_bu(out, out);
1157   return out;
1158 }
1159 
1160 /*
1161  * =============================================================================
1162  * Description : The low half of the vector elements are expanded and
1163  *               added after being doubled.
1164  * Arguments   : Inputs - in_h, in_l
1165  *               Output - out
1166  * Details     : The in_l vector after double zero extension (unsigned byte to
1167  *               signed halfword),added to the in_h vector.
1168  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1169  * =============================================================================
1170  */
__lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)1171 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1172   __m256i out;
1173 
1174   out = __lasx_xvsllwil_hu_bu(in_l, 0);
1175   out = __lasx_xvadd_h(in_h, out);
1176   return out;
1177 }
1178 
1179 /*
1180  * =============================================================================
1181  * Description : The low half of the vector elements are expanded and
1182  *               added after being doubled.
1183  * Arguments   : Inputs - in_h, in_l
1184  *               Output - out
1185  * Details     : The in_l vector after double sign extension (signed halfword to
1186  *               signed word), added to the in_h vector.
1187  * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1188  *        in_h : 0, 1,0,0, -1,0,0,1,
1189  *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1190  *         out : 2, 0,1,2, -1,0,1,1,
1191  * =============================================================================
1192  */
__lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)1193 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1194   __m256i out;
1195 
1196   out = __lasx_xvsllwil_w_h(in_l, 0);
1197   out = __lasx_xvadd_w(in_h, out);
1198   return out;
1199 }
1200 
1201 /*
1202  * =============================================================================
1203  * Description : Multiplication and addition calculation after expansion
1204  *               of the lower half of the vector.
1205  * Arguments   : Inputs - in_c, in_h, in_l
1206  *               Output - out
1207  * Details     : The in_h vector and the in_l vector are multiplied after
1208  *               the lower half of the two-fold sign extension (signed halfword
1209  *               to signed word), and the result is added to the vector in_c,
1210  *               then stored to the out vector.
1211  * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1212  *        in_c : 1,2,3,4, 5,6,7,8
1213  *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1214  *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1215  *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1216  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1217  * =============================================================================
1218  */
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* Widen the lower halfwords of both factors to signed words (a shift
   * count of 0 means "extend only"). */
  const __m256i lhs = __lasx_xvsllwil_w_h(in_h, 0);
  const __m256i rhs = __lasx_xvsllwil_w_h(in_l, 0);
  /* Word-wise product, accumulated onto in_c below. */
  const __m256i product = __lasx_xvmul_w(lhs, rhs);

  return __lasx_xvadd_w(product, in_c);
}
1229 
1230 /*
1231  * =============================================================================
1232  * Description : Multiplication and addition calculation after expansion
1233  *               of the higher half of the vector.
1234  * Arguments   : Inputs - in_c, in_h, in_l
1235  *               Output - out
1236  * Details     : The in_h vector and the in_l vector are multiplied after
1237  *               the higher half of the two-fold sign extension (signed
1238  *               halfword to signed word), and the result is added to
1239  *               the vector in_c, then stored to the out vector.
1240  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1241  * =============================================================================
1242  */
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* Interleaving each operand with itself duplicates its upper halfwords,
   * so every even halfword slot holds one of them ... */
  const __m256i lhs = __lasx_xvilvh_h(in_h, in_h);
  const __m256i rhs = __lasx_xvilvh_h(in_l, in_l);
  /* ... and the even-element widening multiply yields their word products. */
  const __m256i product = __lasx_xvmulwev_w_h(lhs, rhs);

  return __lasx_xvadd_w(product, in_c);
}
1253 
1254 /*
1255  * =============================================================================
1256  * Description : Multiplication calculation after expansion of the lower
1257  *               half of the vector.
1258  * Arguments   : Inputs - in_h, in_l
1259  *               Output - out
1260  * Details     : The in_h vector and the in_l vector are multiplied after
1261  *               the lower half of the two-fold sign extension (signed
1262  *               halfword to signed word), then stored to the out vector.
1263  * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1264  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1265  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1266  *         out : 6,1,3,0, 0,0,1,0
1267  * =============================================================================
1268  */
static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  /* Sign-extend the lower halfwords of both operands to words (a widening
   * shift by zero), then multiply them word by word. */
  const __m256i lhs = __lasx_xvsllwil_w_h(in_h, 0);
  const __m256i rhs = __lasx_xvsllwil_w_h(in_l, 0);

  return __lasx_xvmul_w(lhs, rhs);
}
1277 
1278 /*
1279  * =============================================================================
 * Description : Multiplication calculation after expansion of the higher
 *               half of the vector.
 * Arguments   : Inputs - in_h, in_l
 *               Output - out
 * Details     : The in_h vector and the in_l vector are multiplied after
 *               the higher half of the two-fold sign extension (signed
 *               halfword to signed word), then stored to the out vector.
1287  * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1288  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1289  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1290  *         out : 0,0,0,0, 0,0,0,1
1291  * =============================================================================
1292  */
static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  /* Self-interleaving the high halves places every upper halfword in an
   * even slot, which is what the even-element widening multiply consumes. */
  const __m256i lhs = __lasx_xvilvh_h(in_h, in_h);
  const __m256i rhs = __lasx_xvilvh_h(in_l, in_l);

  return __lasx_xvmulwev_w_h(lhs, rhs);
}
1301 
1302 /*
1303  * =============================================================================
1304  * Description : The low half of the vector elements are added to the high half
1305  *               after being doubled, then saturated.
1306  * Arguments   : Inputs - in_h, in_l
1307  *               Output - out
1308  * Details     : The in_h vector adds the in_l vector after the lower half of
1309  *               the two-fold zero extension (unsigned byte to unsigned
1310  *               halfword) and then saturated. The results are stored to the out
1311  *               vector.
1312  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1313  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1314  *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1315  *               0,0,0,1
1316  *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1317  * =============================================================================
1318  */
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  const __m256i zero = { 0 };
  /* Interleaving with zero bytes zero-extends the lower bytes of in_l to
   * unsigned halfwords. */
  const __m256i widened = __lasx_xvilvl_b(zero, in_l);

  /* Unsigned saturating halfword addition. */
  return __lasx_xvsadd_hu(in_h, widened);
}
1327 
1328 /*
1329  * =============================================================================
1330  * Description : Clip all halfword elements of input vector between min & max
1331  *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1332  * Arguments   : Inputs  - in    (input vector)
1333  *                       - min   (min threshold)
1334  *                       - max   (max threshold)
 *               Outputs - out   (output vector with clipped elements)
1336  *               Return Type - signed halfword
1337  * Example     : out = __lasx_xvclip_h(in, min, max)
1338  *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1339  *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1340  *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1341  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1342  * =============================================================================
1343  */
static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  /* Raise every element to the lower bound first, then cap it at the upper
   * bound. */
  const __m256i floored = __lasx_xvmax_h(min, in);

  return __lasx_xvmin_h(max, floored);
}
1351 
1352 /*
1353  * =============================================================================
1354  * Description : Clip all signed halfword elements of input vector
1355  *               between 0 & 255
1356  * Arguments   : Inputs  - in   (input vector)
1357  *               Outputs - out  (output vector with clipped elements)
1358  *               Return Type - signed halfword
1359  * Example     : See out = __lasx_xvclip255_w(in)
1360  * =============================================================================
1361  */
static inline __m256i __lasx_xvclip255_h(__m256i in) {
  /* Negative halfwords become 0 ... */
  const __m256i non_negative = __lasx_xvmaxi_h(in, 0);

  /* ... and unsigned saturation (immediate 7) caps the rest at 255. */
  return __lasx_xvsat_hu(non_negative, 7);
}
1369 
1370 /*
1371  * =============================================================================
1372  * Description : Clip all signed word elements of input vector
1373  *               between 0 & 255
1374  * Arguments   : Inputs - in   (input vector)
1375  *               Output - out  (output vector with clipped elements)
1376  *               Return Type - signed word
1377  * Example     : out = __lasx_xvclip255_w(in)
1378  *          in : -8,255,280,249, -8,255,280,249
1379  *         out :  0,255,255,249,  0,255,255,249
1380  * =============================================================================
1381  */
static inline __m256i __lasx_xvclip255_w(__m256i in) {
  /* Negative words become 0 ... */
  const __m256i non_negative = __lasx_xvmaxi_w(in, 0);

  /* ... and unsigned saturation (immediate 7) caps the rest at 255. */
  return __lasx_xvsat_wu(non_negative, 7);
}
1389 
1390 /*
1391  * =============================================================================
1392  * Description : Indexed halfword element values are replicated to all
1393  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1394  *               if 'idx >= 8' use xvsplati_h_*.
1395  * Arguments   : Inputs - in, idx
1396  *               Output - out
1397  * Details     : Idx element value from in vector is replicated to all
1398  *               elements in out vector.
1399  *               Valid index range for halfword operation is 0-7
1400  * Example     : out = __lasx_xvsplati_l_h(in, idx)
1401  *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1402  *         idx : 0x02
1403  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1404  * =============================================================================
1405  */
static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  /* Copy the low 128-bit half of in into both halves ... */
  const __m256i low_twice = __lasx_xvpermi_q(in, in, 0x02);

  /* ... then replicate the halfword selected by idx across the vector. */
  return __lasx_xvreplve_h(low_twice, idx);
}
1413 
1414 /*
1415  * =============================================================================
1416  * Description : Indexed halfword element values are replicated to all
1417  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1418  *               if 'idx >= 8' use xvsplati_h_*.
1419  * Arguments   : Inputs - in, idx
1420  *               Output - out
1421  * Details     : Idx element value from in vector is replicated to all
1422  *               elements in out vector.
 *               Valid index range for halfword operation is 8-15
1424  * Example     : out = __lasx_xvsplati_h_h(in, idx)
1425  *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1426  *         idx : 0x09
1427  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1428  * =============================================================================
1429  */
static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  /* Copy the high 128-bit half of in into both halves ... */
  const __m256i high_twice = __lasx_xvpermi_q(in, in, 0x13);

  /* ... then replicate the halfword selected by idx across the vector. */
  return __lasx_xvreplve_h(high_twice, idx);
}
1437 
1438 /*
1439  * =============================================================================
1440  * Description : Transpose 4x4 block with double-word elements in vectors
1441  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1442  *               Outputs - _out0, _out1, _out2, _out3
1443  * Example     : LASX_TRANSPOSE4x4_D
1444  *        _in0 : 1,2,3,4
1445  *        _in1 : 1,2,3,4
1446  *        _in2 : 1,2,3,4
1447  *        _in3 : 1,2,3,4
1448  *
1449  *       _out0 : 1,1,1,1
1450  *       _out1 : 2,2,2,2
1451  *       _out2 : 3,3,3,3
1452  *       _out3 : 4,4,4,4
1453  * =============================================================================
1454  */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    __m256i _lo01, _hi01, _lo23, _hi23;                                  \
    /* Interleave the double-words of rows 0/1 and of rows 2/3. */       \
    _lo01 = __lasx_xvilvl_d(_in1, _in0);                                 \
    _lo23 = __lasx_xvilvl_d(_in3, _in2);                                 \
    _hi01 = __lasx_xvilvh_d(_in1, _in0);                                 \
    _hi23 = __lasx_xvilvh_d(_in3, _in2);                                 \
    /* 0x20 pairs the low 128-bit halves, 0x31 the high halves. */       \
    _out0 = __lasx_xvpermi_q(_lo23, _lo01, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_hi23, _hi01, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_lo23, _lo01, 0x31);                        \
    _out3 = __lasx_xvpermi_q(_hi23, _hi01, 0x31);                        \
  }
1468 
1469 /*
1470  * =============================================================================
1471  * Description : Transpose 8x8 block with word elements in vectors
1472  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1473  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1474  *               _out7
1475  * Example     : LASX_TRANSPOSE8x8_W
1476  *        _in0 : 1,2,3,4,5,6,7,8
1477  *        _in1 : 2,2,3,4,5,6,7,8
1478  *        _in2 : 3,2,3,4,5,6,7,8
1479  *        _in3 : 4,2,3,4,5,6,7,8
1480  *        _in4 : 5,2,3,4,5,6,7,8
1481  *        _in5 : 6,2,3,4,5,6,7,8
1482  *        _in6 : 7,2,3,4,5,6,7,8
1483  *        _in7 : 8,2,3,4,5,6,7,8
1484  *
1485  *       _out0 : 1,2,3,4,5,6,7,8
1486  *       _out1 : 2,2,2,2,2,2,2,2
1487  *       _out2 : 3,3,3,3,3,3,3,3
1488  *       _out3 : 4,4,4,4,4,4,4,4
1489  *       _out4 : 5,5,5,5,5,5,5,5
1490  *       _out5 : 6,6,6,6,6,6,6,6
1491  *       _out6 : 7,7,7,7,7,7,7,7
1492  *       _out7 : 8,8,8,8,8,8,8,8
1493  * =============================================================================
1494  */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _ev_m, _od_m;                                                    \
    __m256i _top0_m, _top1_m, _top2_m, _top3_m;                              \
    __m256i _bot0_m, _bot1_m, _bot2_m, _bot3_m;                              \
                                                                             \
    /* Low-half word interleaves of rows 0-3. */                             \
    _ev_m = __lasx_xvilvl_w(_in2, _in0);                                     \
    _od_m = __lasx_xvilvl_w(_in3, _in1);                                     \
    _top0_m = __lasx_xvilvl_w(_od_m, _ev_m);                                 \
    _top1_m = __lasx_xvilvh_w(_od_m, _ev_m);                                 \
    /* Low-half word interleaves of rows 4-7. */                             \
    _ev_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _od_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _bot0_m = __lasx_xvilvl_w(_od_m, _ev_m);                                 \
    _bot1_m = __lasx_xvilvh_w(_od_m, _ev_m);                                 \
    /* High-half word interleaves of rows 0-3. */                            \
    _ev_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _od_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _top2_m = __lasx_xvilvl_w(_od_m, _ev_m);                                 \
    _top3_m = __lasx_xvilvh_w(_od_m, _ev_m);                                 \
    /* High-half word interleaves of rows 4-7. */                            \
    _ev_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _od_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _bot2_m = __lasx_xvilvl_w(_od_m, _ev_m);                                 \
    _bot3_m = __lasx_xvilvh_w(_od_m, _ev_m);                                 \
    /* 0x20 joins the low 128-bit halves, 0x31 joins the high halves. */     \
    _out0 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x20);                        \
    _out4 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x31);                        \
    _out1 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x20);                        \
    _out5 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x31);                        \
    _out2 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x20);                        \
    _out6 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x31);                        \
    _out3 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x20);                        \
    _out7 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x31);                        \
  }
1528 
1529 /*
1530  * =============================================================================
1531  * Description : Transpose input 16x8 byte block
1532  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1533  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1534  *                         (input 16x8 byte block)
1535  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1536  *                         _out7 (output 8x16 byte block)
1537  * Details     : The rows of the matrix become columns, and the columns become
1538  *               rows.
1539  * Example     : See LASX_TRANSPOSE16x8_H
1540  * =============================================================================
1541  */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _stg0_m, _stg1_m, _stg2_m, _stg3_m;                               \
    __m256i _stg4_m, _stg5_m, _stg6_m, _stg7_m;                               \
                                                                              \
    /* Stage 1: byte-interleave the low halves of rows two apart. */          \
    _stg0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
    _stg1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
    _stg2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
    _stg3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
    _stg4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
    _stg5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
    _stg6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
    _stg7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
    /* Stage 2: byte-interleave those pairs; outputs serve as scratch. */     \
    _out0 = __lasx_xvilvl_b(_stg1_m, _stg0_m);                                \
    _out1 = __lasx_xvilvh_b(_stg1_m, _stg0_m);                                \
    _out2 = __lasx_xvilvl_b(_stg3_m, _stg2_m);                                \
    _out3 = __lasx_xvilvh_b(_stg3_m, _stg2_m);                                \
    _out4 = __lasx_xvilvl_b(_stg5_m, _stg4_m);                                \
    _out5 = __lasx_xvilvh_b(_stg5_m, _stg4_m);                                \
    _out6 = __lasx_xvilvl_b(_stg7_m, _stg6_m);                                \
    _out7 = __lasx_xvilvh_b(_stg7_m, _stg6_m);                                \
    /* Stage 3: word-interleave the stage-2 results. */                       \
    _stg0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
    _stg1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
    _stg2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
    _stg3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
    _stg4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
    _stg5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
    _stg6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
    _stg7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
    /* Stage 4: double-word interleaves produce the final rows. */            \
    _out0 = __lasx_xvilvl_d(_stg1_m, _stg0_m);                                \
    _out1 = __lasx_xvilvh_d(_stg1_m, _stg0_m);                                \
    _out2 = __lasx_xvilvl_d(_stg3_m, _stg2_m);                                \
    _out3 = __lasx_xvilvh_d(_stg3_m, _stg2_m);                                \
    _out4 = __lasx_xvilvl_d(_stg5_m, _stg4_m);                                \
    _out5 = __lasx_xvilvh_d(_stg5_m, _stg4_m);                                \
    _out6 = __lasx_xvilvl_d(_stg7_m, _stg6_m);                                \
    _out7 = __lasx_xvilvh_d(_stg7_m, _stg6_m);                                \
  }
1583 
1584 /*
1585  * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
1592  * Details     : The rows of the matrix become columns, and the columns become
1593  *               rows.
1594  * Example     : LASX_TRANSPOSE16x8_H
1595  *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1596  *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1597  *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1598  *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1599  *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1600  *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1601  *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1602  *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1603  *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1604  *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1605  *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1606  *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1607  *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1608  *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1609  *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1610  *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1611  *
1612  *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
1613  *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1614  *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
1615  *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
1616  *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1617  *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1618  *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
1619  *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
1620  * =============================================================================
1621  */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
    __m256i _res0_m, _res1_m, _res2_m, _res3_m;                               \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
                                                                              \
    /* First half: low-half halfword interleaves give _out0.._out3. */        \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    /* Kept in locals: an output may be the same variable as an input. */     \
    _res0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                       \
    _res1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                       \
    _res2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                       \
    _res3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                       \
                                                                              \
    /* Second half: high-half halfword interleaves give _out4.._out7. */      \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    /* Every input has been read; it is now safe to write the outputs. */     \
    _out0 = _res0_m;                                                          \
    _out1 = _res1_m;                                                          \
    _out2 = _res2_m;                                                          \
    _out3 = _res3_m;                                                          \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
  }
1689 
1690 /*
1691  * =============================================================================
1692  * Description : Transpose 4x4 block with halfword elements in vectors
1693  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1694  *               Outputs - _out0, _out1, _out2, _out3
1695  *               Return Type - signed halfword
1696  * Details     : The rows of the matrix become columns, and the columns become
1697  *               rows.
1698  * Example     : See LASX_TRANSPOSE8x8_H
1699  * =============================================================================
1700  */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    __m256i _ab_m, _cd_m;                                                \
                                                                         \
    /* Halfword-interleave rows 0/1 and rows 2/3 (low halves). */        \
    _ab_m = __lasx_xvilvl_h(_in1, _in0);                                 \
    _cd_m = __lasx_xvilvl_h(_in3, _in2);                                 \
    /* Odd results are the upper double-words of the even ones. */       \
    _out0 = __lasx_xvilvl_w(_cd_m, _ab_m);                               \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
    _out2 = __lasx_xvilvh_w(_cd_m, _ab_m);                               \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
  }
1713 
1714 /*
1715  * =============================================================================
1716  * Description : Transpose input 8x8 byte block
1717  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1718  *                         (input 8x8 byte block)
1719  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1720  *                         _out7 (output 8x8 byte block)
1721  * Example     : See LASX_TRANSPOSE8x8_H
1722  * =============================================================================
1723  */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _pr02_m, _pr13_m, _pr46_m, _pr57_m;                              \
    __m256i _lo03_m, _hi03_m, _lo47_m, _hi47_m;                              \
    /* Byte-interleave rows two apart, then the resulting pairs. */          \
    _pr02_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _pr13_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _lo03_m = __lasx_xvilvl_b(_pr13_m, _pr02_m);                             \
    _hi03_m = __lasx_xvilvh_b(_pr13_m, _pr02_m);                             \
    _pr46_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _pr57_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _lo47_m = __lasx_xvilvl_b(_pr57_m, _pr46_m);                             \
    _hi47_m = __lasx_xvilvh_b(_pr57_m, _pr46_m);                             \
    /* Word interleaves form the even-numbered outputs. */                   \
    _out0 = __lasx_xvilvl_w(_lo47_m, _lo03_m);                               \
    _out2 = __lasx_xvilvh_w(_lo47_m, _lo03_m);                               \
    _out4 = __lasx_xvilvl_w(_hi47_m, _hi03_m);                               \
    _out6 = __lasx_xvilvh_w(_hi47_m, _hi03_m);                               \
    /* Shifting right by 8 bytes yields the odd-numbered outputs. */         \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
1747 
1748 /*
1749  * =============================================================================
1750  * Description : Transpose 8x8 block with halfword elements in vectors.
1751  * Arguments   : Inputs  - _in0, _in1, ~
1752  *               Outputs - _out0, _out1, ~
1753  * Details     : The rows of the matrix become columns, and the columns become
1754  *               rows.
1755  * Example     : LASX_TRANSPOSE8x8_H
1756  *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1757  *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
1758  *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
1759  *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1760  *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
1761  *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1762  *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1763  *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
1764  *
1765  *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
1766  *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1767  *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
1768  *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
1769  *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
1770  *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
1771  *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
1772  *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
1773  * =============================================================================
1774  */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Stage 1: interleave row pairs (0,2), (1,3), (4,6) and (5,7). */       \
    __m256i _r02l_m = __lasx_xvilvl_h(_in2, _in0);                           \
    __m256i _r13l_m = __lasx_xvilvl_h(_in3, _in1);                           \
    __m256i _r02h_m = __lasx_xvilvh_h(_in2, _in0);                           \
    __m256i _r13h_m = __lasx_xvilvh_h(_in3, _in1);                           \
    __m256i _r46l_m = __lasx_xvilvl_h(_in6, _in4);                           \
    __m256i _r57l_m = __lasx_xvilvl_h(_in7, _in5);                           \
    __m256i _r46h_m = __lasx_xvilvh_h(_in6, _in4);                           \
    __m256i _r57h_m = __lasx_xvilvh_h(_in7, _in5);                           \
    /* Stage 2: merge the pairs into 4-row groups (rows 0-3, rows 4-7). */   \
    __m256i _c01a_m = __lasx_xvilvl_h(_r13l_m, _r02l_m);                     \
    __m256i _c23a_m = __lasx_xvilvh_h(_r13l_m, _r02l_m);                     \
    __m256i _c45a_m = __lasx_xvilvl_h(_r13h_m, _r02h_m);                     \
    __m256i _c67a_m = __lasx_xvilvh_h(_r13h_m, _r02h_m);                     \
    __m256i _c01b_m = __lasx_xvilvl_h(_r57l_m, _r46l_m);                     \
    __m256i _c23b_m = __lasx_xvilvh_h(_r57l_m, _r46l_m);                     \
    __m256i _c45b_m = __lasx_xvilvl_h(_r57h_m, _r46h_m);                     \
    __m256i _c67b_m = __lasx_xvilvh_h(_r57h_m, _r46h_m);                     \
    /* Stage 3: join the 64-bit halves of both groups per output. */         \
    _out0 = __lasx_xvpickev_d(_c01b_m, _c01a_m);                             \
    _out2 = __lasx_xvpickev_d(_c23b_m, _c23a_m);                             \
    _out4 = __lasx_xvpickev_d(_c45b_m, _c45a_m);                             \
    _out6 = __lasx_xvpickev_d(_c67b_m, _c67a_m);                             \
    _out1 = __lasx_xvpickod_d(_c01b_m, _c01a_m);                             \
    _out3 = __lasx_xvpickod_d(_c23b_m, _c23a_m);                             \
    _out5 = __lasx_xvpickod_d(_c45b_m, _c45a_m);                             \
    _out7 = __lasx_xvpickod_d(_c67b_m, _c67a_m);                             \
  }
1810 
1811 /*
1812  * =============================================================================
1813  * Description : Butterfly of 4 input vectors
1814  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1815  *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation: sums and differences of mirrored input
 *               pairs. The _B/_H/_W/_D suffix selects the element width.
 * Example     : LASX_BUTTERFLY_4_{B,H,W,D}
1818  *               _out0 = _in0 + _in3;
1819  *               _out1 = _in1 + _in2;
1820  *               _out2 = _in1 - _in2;
1821  *               _out3 = _in0 - _in3;
1822  * =============================================================================
1823  */
/*
 * Byte-element 4-input butterfly. The inputs are copied into locals before
 * any output is written, so each argument is evaluated exactly once and the
 * outputs may alias the inputs (in-place use). Do not pass variables named
 * _bf0_m.._bf3_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                                  \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                                  \
    _out0 = __lasx_xvadd_b(_bf0_m, _bf3_m);                                    \
    _out1 = __lasx_xvadd_b(_bf1_m, _bf2_m);                                    \
    _out2 = __lasx_xvsub_b(_bf1_m, _bf2_m);                                    \
    _out3 = __lasx_xvsub_b(_bf0_m, _bf3_m);                                    \
  }
/*
 * Halfword-element 4-input butterfly. The inputs are copied into locals
 * before any output is written, so each argument is evaluated exactly once
 * and the outputs may alias the inputs (in-place use). Do not pass variables
 * named _bf0_m.._bf3_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                                  \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                                  \
    _out0 = __lasx_xvadd_h(_bf0_m, _bf3_m);                                    \
    _out1 = __lasx_xvadd_h(_bf1_m, _bf2_m);                                    \
    _out2 = __lasx_xvsub_h(_bf1_m, _bf2_m);                                    \
    _out3 = __lasx_xvsub_h(_bf0_m, _bf3_m);                                    \
  }
/*
 * Word-element 4-input butterfly. The inputs are copied into locals before
 * any output is written, so each argument is evaluated exactly once and the
 * outputs may alias the inputs (in-place use). Do not pass variables named
 * _bf0_m.._bf3_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                                  \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                                  \
    _out0 = __lasx_xvadd_w(_bf0_m, _bf3_m);                                    \
    _out1 = __lasx_xvadd_w(_bf1_m, _bf2_m);                                    \
    _out2 = __lasx_xvsub_w(_bf1_m, _bf2_m);                                    \
    _out3 = __lasx_xvsub_w(_bf0_m, _bf3_m);                                    \
  }
/*
 * Doubleword-element 4-input butterfly. The inputs are copied into locals
 * before any output is written, so each argument is evaluated exactly once
 * and the outputs may alias the inputs (in-place use). Do not pass variables
 * named _bf0_m.._bf3_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                                  \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                                  \
    _out0 = __lasx_xvadd_d(_bf0_m, _bf3_m);                                    \
    _out1 = __lasx_xvadd_d(_bf1_m, _bf2_m);                                    \
    _out2 = __lasx_xvsub_d(_bf1_m, _bf2_m);                                    \
    _out3 = __lasx_xvsub_d(_bf0_m, _bf3_m);                                    \
  }
1852 
1853 /*
1854  * =============================================================================
1855  * Description : Butterfly of 8 input vectors
1856  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
1857  *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation: sums and differences of mirrored input
 *               pairs. The _B/_H/_W/_D suffix selects the element width.
 * Example     : LASX_BUTTERFLY_8_{B,H,W,D}
1860  *               _out0 = _in0 + _in7;
1861  *               _out1 = _in1 + _in6;
1862  *               _out2 = _in2 + _in5;
1863  *               _out3 = _in3 + _in4;
1864  *               _out4 = _in3 - _in4;
1865  *               _out5 = _in2 - _in5;
1866  *               _out6 = _in1 - _in6;
1867  *               _out7 = _in0 - _in7;
1868  * =============================================================================
1869  */
/*
 * Byte-element 8-input butterfly. The inputs are copied into locals before
 * any output is written, so each argument is evaluated exactly once and the
 * outputs may alias the inputs (in-place use). Do not pass variables named
 * _bf0_m.._bf7_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                               \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                               \
    __m256i _bf4_m = (_in4), _bf5_m = (_in5);                               \
    __m256i _bf6_m = (_in6), _bf7_m = (_in7);                               \
    _out0 = __lasx_xvadd_b(_bf0_m, _bf7_m);                                 \
    _out1 = __lasx_xvadd_b(_bf1_m, _bf6_m);                                 \
    _out2 = __lasx_xvadd_b(_bf2_m, _bf5_m);                                 \
    _out3 = __lasx_xvadd_b(_bf3_m, _bf4_m);                                 \
    _out4 = __lasx_xvsub_b(_bf3_m, _bf4_m);                                 \
    _out5 = __lasx_xvsub_b(_bf2_m, _bf5_m);                                 \
    _out6 = __lasx_xvsub_b(_bf1_m, _bf6_m);                                 \
    _out7 = __lasx_xvsub_b(_bf0_m, _bf7_m);                                 \
  }
1883 
/*
 * Halfword-element 8-input butterfly. The inputs are copied into locals
 * before any output is written, so each argument is evaluated exactly once
 * and the outputs may alias the inputs (in-place use). Do not pass variables
 * named _bf0_m.._bf7_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                               \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                               \
    __m256i _bf4_m = (_in4), _bf5_m = (_in5);                               \
    __m256i _bf6_m = (_in6), _bf7_m = (_in7);                               \
    _out0 = __lasx_xvadd_h(_bf0_m, _bf7_m);                                 \
    _out1 = __lasx_xvadd_h(_bf1_m, _bf6_m);                                 \
    _out2 = __lasx_xvadd_h(_bf2_m, _bf5_m);                                 \
    _out3 = __lasx_xvadd_h(_bf3_m, _bf4_m);                                 \
    _out4 = __lasx_xvsub_h(_bf3_m, _bf4_m);                                 \
    _out5 = __lasx_xvsub_h(_bf2_m, _bf5_m);                                 \
    _out6 = __lasx_xvsub_h(_bf1_m, _bf6_m);                                 \
    _out7 = __lasx_xvsub_h(_bf0_m, _bf7_m);                                 \
  }
1897 
/*
 * Word-element 8-input butterfly. The inputs are copied into locals before
 * any output is written, so each argument is evaluated exactly once and the
 * outputs may alias the inputs (in-place use). Do not pass variables named
 * _bf0_m.._bf7_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                               \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                               \
    __m256i _bf4_m = (_in4), _bf5_m = (_in5);                               \
    __m256i _bf6_m = (_in6), _bf7_m = (_in7);                               \
    _out0 = __lasx_xvadd_w(_bf0_m, _bf7_m);                                 \
    _out1 = __lasx_xvadd_w(_bf1_m, _bf6_m);                                 \
    _out2 = __lasx_xvadd_w(_bf2_m, _bf5_m);                                 \
    _out3 = __lasx_xvadd_w(_bf3_m, _bf4_m);                                 \
    _out4 = __lasx_xvsub_w(_bf3_m, _bf4_m);                                 \
    _out5 = __lasx_xvsub_w(_bf2_m, _bf5_m);                                 \
    _out6 = __lasx_xvsub_w(_bf1_m, _bf6_m);                                 \
    _out7 = __lasx_xvsub_w(_bf0_m, _bf7_m);                                 \
  }
1911 
/*
 * Doubleword-element 8-input butterfly. The inputs are copied into locals
 * before any output is written, so each argument is evaluated exactly once
 * and the outputs may alias the inputs (in-place use). Do not pass variables
 * named _bf0_m.._bf7_m, which are the macro's own temporaries.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf0_m = (_in0), _bf1_m = (_in1);                               \
    __m256i _bf2_m = (_in2), _bf3_m = (_in3);                               \
    __m256i _bf4_m = (_in4), _bf5_m = (_in5);                               \
    __m256i _bf6_m = (_in6), _bf7_m = (_in7);                               \
    _out0 = __lasx_xvadd_d(_bf0_m, _bf7_m);                                 \
    _out1 = __lasx_xvadd_d(_bf1_m, _bf6_m);                                 \
    _out2 = __lasx_xvadd_d(_bf2_m, _bf5_m);                                 \
    _out3 = __lasx_xvadd_d(_bf3_m, _bf4_m);                                 \
    _out4 = __lasx_xvsub_d(_bf3_m, _bf4_m);                                 \
    _out5 = __lasx_xvsub_d(_bf2_m, _bf5_m);                                 \
    _out6 = __lasx_xvsub_d(_bf1_m, _bf6_m);                                 \
    _out7 = __lasx_xvsub_d(_bf0_m, _bf7_m);                                 \
  }
1925 
1926 #endif  // LASX
1927 
1928 /*
1929  * =============================================================================
1930  * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs - none
 * Details     : Print the first 'element_num' elements of 'in0', viewed as an
 *               'RTYPE' vector, each followed by a comma. If 'enter' is
 *               non-zero, the prefix "\nVP:" is printed first.
1935  * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
1936  *               VP:1,2,3,4,
1937  * =============================================================================
1938  */
/*
 * Debug helper: view 'in0' as a vector of type 'RTYPE' and print its first
 * 'element_num' elements with printf("%d,"), optionally preceded by "\nVP:"
 * when 'enter' is non-zero.
 *
 * Every macro argument is parenthesized so that expressions can be passed.
 * Each element is converted to int before printing so the argument always
 * matches the "%d" conversion; values outside the range of int are therefore
 * not preserved. printf() must be declared wherever the macro is expanded.
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
  {                                                                \
    RTYPE _vp_vec_m = (RTYPE)(in0);                                \
    int _vp_idx_m;                                                 \
    if (enter) {                                                   \
      printf("\nVP:");                                             \
    }                                                              \
    for (_vp_idx_m = 0; _vp_idx_m < (element_num); _vp_idx_m++) {  \
      printf("%d,", (int)_vp_vec_m[_vp_idx_m]);                    \
    }                                                              \
  }
1946 
1947 #endif /* LOONGSON_INTRINSICS_H */
1948 #endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
1949