1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited
3cabdff1aSopenharmony_ci * All rights reserved.
4cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
5cabdff1aSopenharmony_ci *                Xiwei Gu   <guxiwei-hf@loongson.cn>
6cabdff1aSopenharmony_ci *                Lu Wang    <wanglu@loongson.cn>
7cabdff1aSopenharmony_ci *
8cabdff1aSopenharmony_ci * This file is part of FFmpeg.
9cabdff1aSopenharmony_ci *
10cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci *
15cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci * Lesser General Public License for more details.
19cabdff1aSopenharmony_ci *
20cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci *
24cabdff1aSopenharmony_ci */
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ci#ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
27cabdff1aSopenharmony_ci#define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ci/*
30cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited
31cabdff1aSopenharmony_ci * All rights reserved.
32cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
33cabdff1aSopenharmony_ci *                Xiwei Gu   <guxiwei-hf@loongson.cn>
34cabdff1aSopenharmony_ci *                Lu Wang    <wanglu@loongson.cn>
35cabdff1aSopenharmony_ci *
36cabdff1aSopenharmony_ci * This file is a header file for loongarch builtin extension.
37cabdff1aSopenharmony_ci *
38cabdff1aSopenharmony_ci */
39cabdff1aSopenharmony_ci
40cabdff1aSopenharmony_ci#ifndef LOONGSON_INTRINSICS_H
41cabdff1aSopenharmony_ci#define LOONGSON_INTRINSICS_H
42cabdff1aSopenharmony_ci
/**
 * Version number components (MAJOR.MINOR.MICRO) and what a bump of each
 * component signals:
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 1
#define LSOM_VERSION_MICRO 0
51cabdff1aSopenharmony_ci
/*
 * Apply the one-argument operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);  _OUT1 = _INS(_IN1);
 * _INS may be a function or a function-like macro; _OUT0/_OUT1 must be
 * assignable. The expansion is a brace-enclosed block, not an expression:
 * it yields no value, and "if (c) DUP2_ARG1(...); else ..." does not compile
 * because the trailing semicolon ends the if statement.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  {                                               \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  }
57cabdff1aSopenharmony_ci
/*
 * Apply the two-argument operation _INS to two independent argument pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 * _INS may be a function or a function-like macro; _OUT0/_OUT1 must be
 * assignable. The expansion is a brace-enclosed block, not an expression:
 * it yields no value, and "if (c) DUP2_ARG2(...); else ..." does not compile
 * because the trailing semicolon ends the if statement.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  {                                                           \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  }
63cabdff1aSopenharmony_ci
/*
 * Apply the three-argument operation _INS to two independent argument
 * triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);  _OUT1 = _INS(_IN3, _IN4, _IN5);
 * _INS may be a function or a function-like macro; _OUT0/_OUT1 must be
 * assignable. The expansion is a brace-enclosed block, not an expression:
 * it yields no value, and "if (c) DUP2_ARG3(...); else ..." does not compile
 * because the trailing semicolon ends the if statement.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  {                                                                       \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  }
69cabdff1aSopenharmony_ci
/*
 * Apply the one-argument operation _INS to four independent inputs:
 *   _OUTk = _INS(_INk)   for k = 0..3, evaluated in that order.
 * The expansion is a brace-enclosed block, not an expression.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  {                                                                         \
    _OUT0 = _INS(_IN0);                                                     \
    _OUT1 = _INS(_IN1);                                                     \
    _OUT2 = _INS(_IN2);                                                     \
    _OUT3 = _INS(_IN3);                                                     \
  }
75cabdff1aSopenharmony_ci
/*
 * Apply the two-argument operation _INS to four independent argument pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);  _OUT3 = _INS(_IN6, _IN7);
 * The expansion is a brace-enclosed block, not an expression.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  {                                                                            \
    _OUT0 = _INS(_IN0, _IN1);                                                  \
    _OUT1 = _INS(_IN2, _IN3);                                                  \
    _OUT2 = _INS(_IN4, _IN5);                                                  \
    _OUT3 = _INS(_IN6, _IN7);                                                  \
  }
82cabdff1aSopenharmony_ci
/*
 * Apply the three-argument operation _INS to four independent argument
 * triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);   _OUT1 = _INS(_IN3, _IN4, _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);   _OUT3 = _INS(_IN9, _IN10, _IN11);
 * The expansion is a brace-enclosed block, not an expression.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  {                                                                           \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                           \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                           \
    _OUT2 = _INS(_IN6, _IN7, _IN8);                                           \
    _OUT3 = _INS(_IN9, _IN10, _IN11);                                         \
  }
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci#ifdef __loongarch_sx
91cabdff1aSopenharmony_ci#include <lsxintrin.h>
92cabdff1aSopenharmony_ci/*
93cabdff1aSopenharmony_ci * =============================================================================
94cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements
95cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_c, in_h, in_l
96cabdff1aSopenharmony_ci *               Outputs - out
97cabdff1aSopenharmony_ci *               Return Type - halfword
98cabdff1aSopenharmony_ci * Details     : Signed byte elements from in_h are multiplied by
99cabdff1aSopenharmony_ci *               signed byte elements from in_l, and then added adjacent to
100cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
101cabdff1aSopenharmony_ci *               Then the results plus to signed half-word elements from in_c.
102cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
103cabdff1aSopenharmony_ci *        in_c : 1,2,3,4, 1,2,3,4
104cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
105cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
106cabdff1aSopenharmony_ci *         out : 23,40,41,26, 23,40,41,26
107cabdff1aSopenharmony_ci * =============================================================================
108cabdff1aSopenharmony_ci */
109cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
110cabdff1aSopenharmony_ci                                        __m128i in_l) {
111cabdff1aSopenharmony_ci  __m128i out;
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_ci  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
114cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
115cabdff1aSopenharmony_ci  return out;
116cabdff1aSopenharmony_ci}
117cabdff1aSopenharmony_ci
118cabdff1aSopenharmony_ci/*
119cabdff1aSopenharmony_ci * =============================================================================
120cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements
121cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_c, in_h, in_l
122cabdff1aSopenharmony_ci *               Outputs - out
123cabdff1aSopenharmony_ci *               Return Type - halfword
124cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied by
125cabdff1aSopenharmony_ci *               unsigned byte elements from in_l, and then added adjacent to
126cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
127cabdff1aSopenharmony_ci *               The results plus to signed half-word elements from in_c.
128cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
129cabdff1aSopenharmony_ci *        in_c : 1,2,3,4, 1,2,3,4
130cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
131cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
132cabdff1aSopenharmony_ci *         out : 23,40,41,26, 23,40,41,26
133cabdff1aSopenharmony_ci * =============================================================================
134cabdff1aSopenharmony_ci */
135cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
136cabdff1aSopenharmony_ci                                         __m128i in_l) {
137cabdff1aSopenharmony_ci  __m128i out;
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
140cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
141cabdff1aSopenharmony_ci  return out;
142cabdff1aSopenharmony_ci}
143cabdff1aSopenharmony_ci
144cabdff1aSopenharmony_ci/*
145cabdff1aSopenharmony_ci * =============================================================================
146cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements
147cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_c, in_h, in_l
148cabdff1aSopenharmony_ci *               Outputs - out
149cabdff1aSopenharmony_ci *               Return Type - halfword
150cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied by
151cabdff1aSopenharmony_ci *               signed byte elements from in_l, and then added adjacent to
152cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
153cabdff1aSopenharmony_ci *               The results plus to signed half-word elements from in_c.
154cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
155cabdff1aSopenharmony_ci *        in_c : 1,1,1,1, 1,1,1,1
156cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
157cabdff1aSopenharmony_ci *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
158cabdff1aSopenharmony_ci *         out : -4,-24,-60,-112, 6,26,62,114
159cabdff1aSopenharmony_ci * =============================================================================
160cabdff1aSopenharmony_ci */
161cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
162cabdff1aSopenharmony_ci                                           __m128i in_l) {
163cabdff1aSopenharmony_ci  __m128i out;
164cabdff1aSopenharmony_ci
165cabdff1aSopenharmony_ci  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
166cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
167cabdff1aSopenharmony_ci  return out;
168cabdff1aSopenharmony_ci}
169cabdff1aSopenharmony_ci
170cabdff1aSopenharmony_ci/*
171cabdff1aSopenharmony_ci * =============================================================================
172cabdff1aSopenharmony_ci * Description : Dot product & addition of half-word vector elements
173cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_c, in_h, in_l
174cabdff1aSopenharmony_ci *               Outputs - out
175cabdff1aSopenharmony_ci *               Return Type - __m128i
176cabdff1aSopenharmony_ci * Details     : Signed half-word elements from in_h are multiplied by
177cabdff1aSopenharmony_ci *               signed half-word elements from in_l, and then added adjacent to
178cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
179cabdff1aSopenharmony_ci *               Then the results plus to signed word elements from in_c.
180cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
181cabdff1aSopenharmony_ci *        in_c : 1,2,3,4
182cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8
183cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1
184cabdff1aSopenharmony_ci *         out : 23,40,41,26
185cabdff1aSopenharmony_ci * =============================================================================
186cabdff1aSopenharmony_ci */
187cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
188cabdff1aSopenharmony_ci                                        __m128i in_l) {
189cabdff1aSopenharmony_ci  __m128i out;
190cabdff1aSopenharmony_ci
191cabdff1aSopenharmony_ci  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
192cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
193cabdff1aSopenharmony_ci  return out;
194cabdff1aSopenharmony_ci}
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_ci/*
197cabdff1aSopenharmony_ci * =============================================================================
198cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements
199cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_h, in_l
200cabdff1aSopenharmony_ci *               Outputs - out
201cabdff1aSopenharmony_ci *               Return Type - halfword
202cabdff1aSopenharmony_ci * Details     : Signed byte elements from in_h are multiplied by
203cabdff1aSopenharmony_ci *               signed byte elements from in_l, and then added adjacent to
204cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
205cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
206cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
207cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
208cabdff1aSopenharmony_ci *         out : 22,38,38,22, 22,38,38,22
209cabdff1aSopenharmony_ci * =============================================================================
210cabdff1aSopenharmony_ci */
211cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
212cabdff1aSopenharmony_ci  __m128i out;
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci  out = __lsx_vmulwev_h_b(in_h, in_l);
215cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
216cabdff1aSopenharmony_ci  return out;
217cabdff1aSopenharmony_ci}
218cabdff1aSopenharmony_ci
219cabdff1aSopenharmony_ci/*
220cabdff1aSopenharmony_ci * =============================================================================
221cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements
222cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_h, in_l
223cabdff1aSopenharmony_ci *               Outputs - out
224cabdff1aSopenharmony_ci *               Return Type - halfword
225cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied by
226cabdff1aSopenharmony_ci *               unsigned byte elements from in_l, and then added adjacent to
227cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
228cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
229cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
230cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
231cabdff1aSopenharmony_ci *         out : 22,38,38,22, 22,38,38,22
232cabdff1aSopenharmony_ci * =============================================================================
233cabdff1aSopenharmony_ci */
234cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
235cabdff1aSopenharmony_ci  __m128i out;
236cabdff1aSopenharmony_ci
237cabdff1aSopenharmony_ci  out = __lsx_vmulwev_h_bu(in_h, in_l);
238cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
239cabdff1aSopenharmony_ci  return out;
240cabdff1aSopenharmony_ci}
241cabdff1aSopenharmony_ci
242cabdff1aSopenharmony_ci/*
243cabdff1aSopenharmony_ci * =============================================================================
244cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements
245cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_h, in_l
246cabdff1aSopenharmony_ci *               Outputs - out
247cabdff1aSopenharmony_ci *               Return Type - halfword
248cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied by
249cabdff1aSopenharmony_ci *               signed byte elements from in_l, and then added adjacent to
250cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
251cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
252cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
253cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
254cabdff1aSopenharmony_ci *         out : 22,38,38,22, 22,38,38,6
255cabdff1aSopenharmony_ci * =============================================================================
256cabdff1aSopenharmony_ci */
257cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
258cabdff1aSopenharmony_ci  __m128i out;
259cabdff1aSopenharmony_ci
260cabdff1aSopenharmony_ci  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
261cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
262cabdff1aSopenharmony_ci  return out;
263cabdff1aSopenharmony_ci}
264cabdff1aSopenharmony_ci
265cabdff1aSopenharmony_ci/*
266cabdff1aSopenharmony_ci * =============================================================================
267cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements
268cabdff1aSopenharmony_ci * Arguments   : Inputs  - in_h, in_l
269cabdff1aSopenharmony_ci *               Outputs - out
270cabdff1aSopenharmony_ci *               Return Type - halfword
271cabdff1aSopenharmony_ci * Details     : Signed byte elements from in_h are multiplied by
272cabdff1aSopenharmony_ci *               signed byte elements from in_l, and then added adjacent to
273cabdff1aSopenharmony_ci *               each other to get results with the twice size of input.
274cabdff1aSopenharmony_ci * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
275cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8
276cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1
277cabdff1aSopenharmony_ci *         out : 22,38,38,22
278cabdff1aSopenharmony_ci * =============================================================================
279cabdff1aSopenharmony_ci */
280cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
281cabdff1aSopenharmony_ci  __m128i out;
282cabdff1aSopenharmony_ci
283cabdff1aSopenharmony_ci  out = __lsx_vmulwev_w_h(in_h, in_l);
284cabdff1aSopenharmony_ci  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
285cabdff1aSopenharmony_ci  return out;
286cabdff1aSopenharmony_ci}
287cabdff1aSopenharmony_ci
288cabdff1aSopenharmony_ci/*
289cabdff1aSopenharmony_ci * =============================================================================
290cabdff1aSopenharmony_ci * Description : Clip all halfword elements of input vector between min & max
291cabdff1aSopenharmony_ci *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
292cabdff1aSopenharmony_ci *               (_in))
293cabdff1aSopenharmony_ci * Arguments   : Inputs  - _in  (input vector)
294cabdff1aSopenharmony_ci *                       - min  (min threshold)
295cabdff1aSopenharmony_ci *                       - max  (max threshold)
296cabdff1aSopenharmony_ci *               Outputs - out  (output vector with clipped elements)
297cabdff1aSopenharmony_ci *               Return Type - signed halfword
298cabdff1aSopenharmony_ci * Example     : out = __lsx_vclip_h(_in)
299cabdff1aSopenharmony_ci *         _in : -8,2,280,249, -8,255,280,249
300cabdff1aSopenharmony_ci *         min : 1,1,1,1, 1,1,1,1
301cabdff1aSopenharmony_ci *         max : 9,9,9,9, 9,9,9,9
302cabdff1aSopenharmony_ci *         out : 1,2,9,9, 1,9,9,9
303cabdff1aSopenharmony_ci * =============================================================================
304cabdff1aSopenharmony_ci */
305cabdff1aSopenharmony_cistatic inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
306cabdff1aSopenharmony_ci  __m128i out;
307cabdff1aSopenharmony_ci
308cabdff1aSopenharmony_ci  out = __lsx_vmax_h(min, _in);
309cabdff1aSopenharmony_ci  out = __lsx_vmin_h(max, out);
310cabdff1aSopenharmony_ci  return out;
311cabdff1aSopenharmony_ci}
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_ci/*
314cabdff1aSopenharmony_ci * =============================================================================
315cabdff1aSopenharmony_ci * Description : Set each element of vector between 0 and 255
316cabdff1aSopenharmony_ci * Arguments   : Inputs  - _in
317cabdff1aSopenharmony_ci *               Outputs - out
318cabdff1aSopenharmony_ci *               Return Type - halfword
319cabdff1aSopenharmony_ci * Details     : Signed byte elements from _in are clamped between 0 and 255.
320cabdff1aSopenharmony_ci * Example     : out = __lsx_vclip255_h(_in)
321cabdff1aSopenharmony_ci *         _in : -8,255,280,249, -8,255,280,249
322cabdff1aSopenharmony_ci *         out : 0,255,255,249, 0,255,255,249
323cabdff1aSopenharmony_ci * =============================================================================
324cabdff1aSopenharmony_ci */
325cabdff1aSopenharmony_cistatic inline __m128i __lsx_vclip255_h(__m128i _in) {
326cabdff1aSopenharmony_ci  __m128i out;
327cabdff1aSopenharmony_ci
328cabdff1aSopenharmony_ci  out = __lsx_vmaxi_h(_in, 0);
329cabdff1aSopenharmony_ci  out = __lsx_vsat_hu(out, 7);
330cabdff1aSopenharmony_ci  return out;
331cabdff1aSopenharmony_ci}
332cabdff1aSopenharmony_ci
333cabdff1aSopenharmony_ci/*
334cabdff1aSopenharmony_ci * =============================================================================
335cabdff1aSopenharmony_ci * Description : Set each element of vector between 0 and 255
336cabdff1aSopenharmony_ci * Arguments   : Inputs  - _in
337cabdff1aSopenharmony_ci *               Outputs - out
338cabdff1aSopenharmony_ci *               Return Type - word
339cabdff1aSopenharmony_ci * Details     : Signed byte elements from _in are clamped between 0 and 255.
340cabdff1aSopenharmony_ci * Example     : out = __lsx_vclip255_w(_in)
341cabdff1aSopenharmony_ci *         _in : -8,255,280,249
342cabdff1aSopenharmony_ci *         out : 0,255,255,249
343cabdff1aSopenharmony_ci * =============================================================================
344cabdff1aSopenharmony_ci */
345cabdff1aSopenharmony_cistatic inline __m128i __lsx_vclip255_w(__m128i _in) {
346cabdff1aSopenharmony_ci  __m128i out;
347cabdff1aSopenharmony_ci
348cabdff1aSopenharmony_ci  out = __lsx_vmaxi_w(_in, 0);
349cabdff1aSopenharmony_ci  out = __lsx_vsat_wu(out, 7);
350cabdff1aSopenharmony_ci  return out;
351cabdff1aSopenharmony_ci}
352cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1 (assignable __m128i variables)
 *               Outputs - _in0, _in1 (in-place)
 * Details     : The two vectors are exchanged through a temporary. Unlike an
 *               XOR-based exchange, this leaves the value intact when both
 *               arguments name the same variable (XOR swapping a variable
 *               with itself clears it).
 *               Each argument is evaluated twice, so pass plain variables
 *               without side effects. The expansion is a brace-enclosed
 *               block, not an expression.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)         \
  {                                  \
    __m128i _lsx_swap_tmp = (_in0);  \
    (_in0) = (_in1);                 \
    (_in1) = _lsx_swap_tmp;          \
  }
372cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are first interleaved pairwise at word granularity
 *               (_in0 with _in1, _in2 with _in3), then the intermediate
 *               vectors are interleaved at double-word granularity to form
 *               the columns. All inputs are read into temporaries before any
 *               output is written, so outputs may name the same variables as
 *               inputs. The temporaries are called _t0.._t3; arguments must
 *               not use those names. The expansion is a brace-enclosed
 *               block, not an expression.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  }
399cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low 8 bytes of each input are read.
 *               The transposed row is in the low 8 bytes of each output.
 *               The upper 8 bytes of the odd outputs are zero; the upper
 *               8 bytes of the even outputs hold a copy of the following
 *               row, because the odd outputs are extracted from them.
 *               All inputs are read into temporaries before any output is
 *               written, so outputs may name the same variables as inputs.
 *               The macro's locals are underscore-prefixed (_zero, _shuf8,
 *               _t0.._t7) so they do not shadow caller variables such as
 *               "zero"; arguments must not use those names. The expansion is
 *               a brace-enclosed block, not an expression.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 01,11,21,31,41,51,61,71
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 03,13,23,33,43,53,63,73
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 05,15,25,35,45,55,65,75
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 07,17,27,37,47,57,67,77
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m128i _zero = { 0 };                                                  \
    __m128i _shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };            \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
                                                                            \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
    _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8);                            \
    _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8);                            \
    _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8);                            \
    _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8);                            \
  }
453cabdff1aSopenharmony_ci
454cabdff1aSopenharmony_ci/*
455cabdff1aSopenharmony_ci * =============================================================================
456cabdff1aSopenharmony_ci * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows.
460cabdff1aSopenharmony_ci * Example     :
461cabdff1aSopenharmony_ci *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
462cabdff1aSopenharmony_ci *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
463cabdff1aSopenharmony_ci *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
464cabdff1aSopenharmony_ci *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
465cabdff1aSopenharmony_ci *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
466cabdff1aSopenharmony_ci *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
467cabdff1aSopenharmony_ci *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
468cabdff1aSopenharmony_ci *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
469cabdff1aSopenharmony_ci * =============================================================================
470cabdff1aSopenharmony_ci */
471cabdff1aSopenharmony_ci#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
472cabdff1aSopenharmony_ci                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
473cabdff1aSopenharmony_ci                           _out7)                                           \
474cabdff1aSopenharmony_ci  {                                                                         \
475cabdff1aSopenharmony_ci    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
476cabdff1aSopenharmony_ci    /* Pass 1: two levels of halfword interleaves (vilvl_h / vilvh_h). */   \
477cabdff1aSopenharmony_ci    _s0 = __lsx_vilvl_h(_in6, _in4);    /* rows 4-7, low halves  */         \
478cabdff1aSopenharmony_ci    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
479cabdff1aSopenharmony_ci    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
480cabdff1aSopenharmony_ci    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
481cabdff1aSopenharmony_ci    _s0 = __lsx_vilvh_h(_in6, _in4);    /* rows 4-7, high halves */         \
482cabdff1aSopenharmony_ci    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
483cabdff1aSopenharmony_ci    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
484cabdff1aSopenharmony_ci    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
485cabdff1aSopenharmony_ci    _s0 = __lsx_vilvl_h(_in2, _in0);    /* rows 0-3, low halves  */         \
486cabdff1aSopenharmony_ci    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
487cabdff1aSopenharmony_ci    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
488cabdff1aSopenharmony_ci    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
489cabdff1aSopenharmony_ci    _s0 = __lsx_vilvh_h(_in2, _in0);    /* rows 0-3, high halves */         \
490cabdff1aSopenharmony_ci    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
491cabdff1aSopenharmony_ci    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
492cabdff1aSopenharmony_ci    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
493cabdff1aSopenharmony_ci    /* Pass 2: vpickev_d/vpickod_d pair up 64-bit halves of _t0.._t7. */    \
494cabdff1aSopenharmony_ci    _out0 = __lsx_vpickev_d(_t0, _t4);  /* even outs: low 64 bits */        \
495cabdff1aSopenharmony_ci    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
496cabdff1aSopenharmony_ci    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
497cabdff1aSopenharmony_ci    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
498cabdff1aSopenharmony_ci    _out1 = __lsx_vpickod_d(_t0, _t4);  /* odd outs: high 64 bits */        \
499cabdff1aSopenharmony_ci    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
500cabdff1aSopenharmony_ci    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
501cabdff1aSopenharmony_ci    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
502cabdff1aSopenharmony_ci  }
503cabdff1aSopenharmony_ci
504cabdff1aSopenharmony_ci/*
505cabdff1aSopenharmony_ci * =============================================================================
506cabdff1aSopenharmony_ci * Description : Transpose input 8x4 byte block into 4x8
507cabdff1aSopenharmony_ci * Arguments   : Inputs  - _in0, _in1, ..., _in7       (input 8x4 byte block)
508cabdff1aSopenharmony_ci *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
509cabdff1aSopenharmony_ci *               Return Type - __m128i
510cabdff1aSopenharmony_ci * Details     : The rows of the matrix become columns, and the columns become
511cabdff1aSopenharmony_ci *               rows.
512cabdff1aSopenharmony_ci * Example     : LSX_TRANSPOSE8x4_B
513cabdff1aSopenharmony_ci *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
514cabdff1aSopenharmony_ci *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
515cabdff1aSopenharmony_ci *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
516cabdff1aSopenharmony_ci *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
517cabdff1aSopenharmony_ci *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
518cabdff1aSopenharmony_ci *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
519cabdff1aSopenharmony_ci *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
520cabdff1aSopenharmony_ci *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
521cabdff1aSopenharmony_ci *
522cabdff1aSopenharmony_ci *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
523cabdff1aSopenharmony_ci *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
524cabdff1aSopenharmony_ci *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
525cabdff1aSopenharmony_ci *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
526cabdff1aSopenharmony_ci * =============================================================================
527cabdff1aSopenharmony_ci */
528cabdff1aSopenharmony_ci#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
529cabdff1aSopenharmony_ci                           _out0, _out1, _out2, _out3)                     \
530cabdff1aSopenharmony_ci  {                                                                        \
531cabdff1aSopenharmony_ci    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
532cabdff1aSopenharmony_ci    /* Pair rows k and k+4 word-wise, then byte-interleave neighbours. */  \
533cabdff1aSopenharmony_ci    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
534cabdff1aSopenharmony_ci    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
535cabdff1aSopenharmony_ci    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
536cabdff1aSopenharmony_ci    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
537cabdff1aSopenharmony_ci    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
538cabdff1aSopenharmony_ci    /* Merge the two byte-interleaved halves at halfword granularity. */   \
539cabdff1aSopenharmony_ci    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
540cabdff1aSopenharmony_ci    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
541cabdff1aSopenharmony_ci    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
542cabdff1aSopenharmony_ci    /* NOTE(review): rows land in low 64 bits; high half not zeroed? */    \
543cabdff1aSopenharmony_ci    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
544cabdff1aSopenharmony_ci    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
545cabdff1aSopenharmony_ci    _out1 = __lsx_vilvh_d(_out2, _out0);   /* low 64 = _out0 high 64 */    \
546cabdff1aSopenharmony_ci    _out3 = __lsx_vilvh_d(_out0, _out2);   /* low 64 = _out2 high 64 */    \
547cabdff1aSopenharmony_ci  }
548cabdff1aSopenharmony_ci
549cabdff1aSopenharmony_ci/*
550cabdff1aSopenharmony_ci * =============================================================================
551cabdff1aSopenharmony_ci * Description : Transpose 16x8 block with byte elements in vectors
552cabdff1aSopenharmony_ci * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8,
553cabdff1aSopenharmony_ci *                         in9, in10, in11, in12, in13, in14, in15
554cabdff1aSopenharmony_ci *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
555cabdff1aSopenharmony_ci * Details     :
556cabdff1aSopenharmony_ci * Example     :
557cabdff1aSopenharmony_ci *              000,001,002,003,004,005,006,007
558cabdff1aSopenharmony_ci *              008,009,010,011,012,013,014,015
559cabdff1aSopenharmony_ci *              016,017,018,019,020,021,022,023
560cabdff1aSopenharmony_ci *              024,025,026,027,028,029,030,031
561cabdff1aSopenharmony_ci *              032,033,034,035,036,037,038,039
562cabdff1aSopenharmony_ci *              040,041,042,043,044,045,046,047        000,008,...,112,120
563cabdff1aSopenharmony_ci *              048,049,050,051,052,053,054,055        001,009,...,113,121
564cabdff1aSopenharmony_ci *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
565cabdff1aSopenharmony_ci *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
566cabdff1aSopenharmony_ci *              072,073,074,075,076,077,078,079        004,012,...,116,124
567cabdff1aSopenharmony_ci *              080,081,082,083,084,085,086,087        005,013,...,117,125
568cabdff1aSopenharmony_ci *              088,089,090,091,092,093,094,095        006,014,...,118,126
569cabdff1aSopenharmony_ci *              096,097,098,099,100,101,102,103        007,015,...,119,127
570cabdff1aSopenharmony_ci *              104,105,106,107,108,109,110,111
571cabdff1aSopenharmony_ci *              112,113,114,115,116,117,118,119
572cabdff1aSopenharmony_ci *              120,121,122,123,124,125,126,127
573cabdff1aSopenharmony_ci * =============================================================================
574cabdff1aSopenharmony_ci */
575cabdff1aSopenharmony_ci#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
576cabdff1aSopenharmony_ci                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
577cabdff1aSopenharmony_ci                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
578cabdff1aSopenharmony_ci                            _out6, _out7)                                    \
579cabdff1aSopenharmony_ci  { /* Interleave stages: byte, byte, word, then doubleword. */              \
580cabdff1aSopenharmony_ci    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
581cabdff1aSopenharmony_ci    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; /* scratch vectors */    \
582cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
583cabdff1aSopenharmony_ci              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
584cabdff1aSopenharmony_ci    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
585cabdff1aSopenharmony_ci              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
586cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
587cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
588cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
589cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
590cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
591cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
592cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
593cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
594cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
595cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
596cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
597cabdff1aSopenharmony_ci    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
598cabdff1aSopenharmony_ci  }
599cabdff1aSopenharmony_ci
600cabdff1aSopenharmony_ci/*
601cabdff1aSopenharmony_ci * =============================================================================
602cabdff1aSopenharmony_ci * Description : Butterfly of 4 input vectors
603cabdff1aSopenharmony_ci * Arguments   : Inputs  - in0, in1, in2, in3
604cabdff1aSopenharmony_ci *               Outputs - out0, out1, out2, out3
605cabdff1aSopenharmony_ci * Details     : Butterfly operation
606cabdff1aSopenharmony_ci * Example     :
607cabdff1aSopenharmony_ci *               out0 = in0 + in3;
608cabdff1aSopenharmony_ci *               out1 = in1 + in2;
609cabdff1aSopenharmony_ci *               out2 = in1 - in2;
610cabdff1aSopenharmony_ci *               out3 = in0 - in3;
611cabdff1aSopenharmony_ci * =============================================================================
612cabdff1aSopenharmony_ci */
613cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
614cabdff1aSopenharmony_ci  { /* Byte lanes. Stores run in order: avoid in/out aliasing. */             \
615cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_b(_in0, _in3);  /* pair (0,3): sum        */           \
616cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_b(_in1, _in2);  /* pair (1,2): sum        */           \
617cabdff1aSopenharmony_ci    _out2 = __lsx_vsub_b(_in1, _in2);  /* pair (1,2): difference */           \
618cabdff1aSopenharmony_ci    _out3 = __lsx_vsub_b(_in0, _in3);  /* pair (0,3): difference */           \
619cabdff1aSopenharmony_ci  }
620cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
621cabdff1aSopenharmony_ci  { /* Halfword lanes. Stores run in order: avoid in/out aliasing. */         \
622cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_h(_in0, _in3);  /* pair (0,3): sum        */           \
623cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_h(_in1, _in2);  /* pair (1,2): sum        */           \
624cabdff1aSopenharmony_ci    _out2 = __lsx_vsub_h(_in1, _in2);  /* pair (1,2): difference */           \
625cabdff1aSopenharmony_ci    _out3 = __lsx_vsub_h(_in0, _in3);  /* pair (0,3): difference */           \
626cabdff1aSopenharmony_ci  }
627cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
628cabdff1aSopenharmony_ci  { /* Word lanes. Stores run in order: avoid in/out aliasing. */             \
629cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_w(_in0, _in3);  /* pair (0,3): sum        */           \
630cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_w(_in1, _in2);  /* pair (1,2): sum        */           \
631cabdff1aSopenharmony_ci    _out2 = __lsx_vsub_w(_in1, _in2);  /* pair (1,2): difference */           \
632cabdff1aSopenharmony_ci    _out3 = __lsx_vsub_w(_in0, _in3);  /* pair (0,3): difference */           \
633cabdff1aSopenharmony_ci  }
634cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
635cabdff1aSopenharmony_ci  { /* Doubleword lanes. Stores run in order: avoid in/out aliasing. */       \
636cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_d(_in0, _in3);  /* pair (0,3): sum        */           \
637cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_d(_in1, _in2);  /* pair (1,2): sum        */           \
638cabdff1aSopenharmony_ci    _out2 = __lsx_vsub_d(_in1, _in2);  /* pair (1,2): difference */           \
639cabdff1aSopenharmony_ci    _out3 = __lsx_vsub_d(_in0, _in3);  /* pair (0,3): difference */           \
640cabdff1aSopenharmony_ci  }
641cabdff1aSopenharmony_ci
642cabdff1aSopenharmony_ci/*
643cabdff1aSopenharmony_ci * =============================================================================
644cabdff1aSopenharmony_ci * Description : Butterfly of 8 input vectors
645cabdff1aSopenharmony_ci * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
646cabdff1aSopenharmony_ci *               Outputs - _out0, _out1, _out2, _out3, ~
647cabdff1aSopenharmony_ci * Details     : Butterfly operation
648cabdff1aSopenharmony_ci * Example     :
649cabdff1aSopenharmony_ci *              _out0 = _in0 + _in7;
650cabdff1aSopenharmony_ci *              _out1 = _in1 + _in6;
651cabdff1aSopenharmony_ci *              _out2 = _in2 + _in5;
652cabdff1aSopenharmony_ci *              _out3 = _in3 + _in4;
653cabdff1aSopenharmony_ci *              _out4 = _in3 - _in4;
654cabdff1aSopenharmony_ci *              _out5 = _in2 - _in5;
655cabdff1aSopenharmony_ci *              _out6 = _in1 - _in6;
656cabdff1aSopenharmony_ci *              _out7 = _in0 - _in7;
657cabdff1aSopenharmony_ci * =============================================================================
658cabdff1aSopenharmony_ci */
659cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
660cabdff1aSopenharmony_ci                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
661cabdff1aSopenharmony_ci                          _out7)                                           \
662cabdff1aSopenharmony_ci  { /* Byte lanes. Stores run in order: avoid in/out aliasing. */          \
663cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_b(_in0, _in7);  /* pair (0,7): sum        */        \
664cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_b(_in1, _in6);  /* pair (1,6): sum        */        \
665cabdff1aSopenharmony_ci    _out2 = __lsx_vadd_b(_in2, _in5);  /* pair (2,5): sum        */        \
666cabdff1aSopenharmony_ci    _out3 = __lsx_vadd_b(_in3, _in4);  /* pair (3,4): sum        */        \
667cabdff1aSopenharmony_ci    _out4 = __lsx_vsub_b(_in3, _in4);  /* pair (3,4): difference */        \
668cabdff1aSopenharmony_ci    _out5 = __lsx_vsub_b(_in2, _in5);  /* pair (2,5): difference */        \
669cabdff1aSopenharmony_ci    _out6 = __lsx_vsub_b(_in1, _in6);  /* pair (1,6): difference */        \
670cabdff1aSopenharmony_ci    _out7 = __lsx_vsub_b(_in0, _in7);  /* pair (0,7): difference */        \
671cabdff1aSopenharmony_ci  }
672cabdff1aSopenharmony_ci
673cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
674cabdff1aSopenharmony_ci                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
675cabdff1aSopenharmony_ci                          _out7)                                           \
676cabdff1aSopenharmony_ci  { /* Halfword lanes. Stores run in order: avoid in/out aliasing. */      \
677cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_h(_in0, _in7);  /* pair (0,7): sum        */        \
678cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_h(_in1, _in6);  /* pair (1,6): sum        */        \
679cabdff1aSopenharmony_ci    _out2 = __lsx_vadd_h(_in2, _in5);  /* pair (2,5): sum        */        \
680cabdff1aSopenharmony_ci    _out3 = __lsx_vadd_h(_in3, _in4);  /* pair (3,4): sum        */        \
681cabdff1aSopenharmony_ci    _out4 = __lsx_vsub_h(_in3, _in4);  /* pair (3,4): difference */        \
682cabdff1aSopenharmony_ci    _out5 = __lsx_vsub_h(_in2, _in5);  /* pair (2,5): difference */        \
683cabdff1aSopenharmony_ci    _out6 = __lsx_vsub_h(_in1, _in6);  /* pair (1,6): difference */        \
684cabdff1aSopenharmony_ci    _out7 = __lsx_vsub_h(_in0, _in7);  /* pair (0,7): difference */        \
685cabdff1aSopenharmony_ci  }
686cabdff1aSopenharmony_ci
687cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
688cabdff1aSopenharmony_ci                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
689cabdff1aSopenharmony_ci                          _out7)                                           \
690cabdff1aSopenharmony_ci  { /* Word lanes. Stores run in order: avoid in/out aliasing. */          \
691cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_w(_in0, _in7);  /* pair (0,7): sum        */        \
692cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_w(_in1, _in6);  /* pair (1,6): sum        */        \
693cabdff1aSopenharmony_ci    _out2 = __lsx_vadd_w(_in2, _in5);  /* pair (2,5): sum        */        \
694cabdff1aSopenharmony_ci    _out3 = __lsx_vadd_w(_in3, _in4);  /* pair (3,4): sum        */        \
695cabdff1aSopenharmony_ci    _out4 = __lsx_vsub_w(_in3, _in4);  /* pair (3,4): difference */        \
696cabdff1aSopenharmony_ci    _out5 = __lsx_vsub_w(_in2, _in5);  /* pair (2,5): difference */        \
697cabdff1aSopenharmony_ci    _out6 = __lsx_vsub_w(_in1, _in6);  /* pair (1,6): difference */        \
698cabdff1aSopenharmony_ci    _out7 = __lsx_vsub_w(_in0, _in7);  /* pair (0,7): difference */        \
699cabdff1aSopenharmony_ci  }
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
702cabdff1aSopenharmony_ci                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
703cabdff1aSopenharmony_ci                          _out7)                                           \
704cabdff1aSopenharmony_ci  { /* Doubleword lanes. Stores run in order: avoid in/out aliasing. */    \
705cabdff1aSopenharmony_ci    _out0 = __lsx_vadd_d(_in0, _in7);  /* pair (0,7): sum        */        \
706cabdff1aSopenharmony_ci    _out1 = __lsx_vadd_d(_in1, _in6);  /* pair (1,6): sum        */        \
707cabdff1aSopenharmony_ci    _out2 = __lsx_vadd_d(_in2, _in5);  /* pair (2,5): sum        */        \
708cabdff1aSopenharmony_ci    _out3 = __lsx_vadd_d(_in3, _in4);  /* pair (3,4): sum        */        \
709cabdff1aSopenharmony_ci    _out4 = __lsx_vsub_d(_in3, _in4);  /* pair (3,4): difference */        \
710cabdff1aSopenharmony_ci    _out5 = __lsx_vsub_d(_in2, _in5);  /* pair (2,5): difference */        \
711cabdff1aSopenharmony_ci    _out6 = __lsx_vsub_d(_in1, _in6);  /* pair (1,6): difference */        \
712cabdff1aSopenharmony_ci    _out7 = __lsx_vsub_d(_in0, _in7);  /* pair (0,7): difference */        \
713cabdff1aSopenharmony_ci  }
714cabdff1aSopenharmony_ci
715cabdff1aSopenharmony_ci#endif  // LSX
716cabdff1aSopenharmony_ci
717cabdff1aSopenharmony_ci#ifdef __loongarch_asx
718cabdff1aSopenharmony_ci#include <lasxintrin.h>
719cabdff1aSopenharmony_ci/*
720cabdff1aSopenharmony_ci * =============================================================================
721cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements
722cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
723cabdff1aSopenharmony_ci *               Output - out
724cabdff1aSopenharmony_ci *               Return Type - unsigned halfword
725cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied with
726cabdff1aSopenharmony_ci *               unsigned byte elements from in_l producing a result
727cabdff1aSopenharmony_ci *               twice the size of input i.e. unsigned halfword.
728cabdff1aSopenharmony_ci *               Then the products of adjacent odd-even elements
729cabdff1aSopenharmony_ci *               are added together to form the out vector.
730cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
731cabdff1aSopenharmony_ci * =============================================================================
732cabdff1aSopenharmony_ci */
733cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
734cabdff1aSopenharmony_ci  /* Widening multiply of the even-indexed unsigned byte pairs. */
735cabdff1aSopenharmony_ci  __m256i even_part = __lasx_xvmulwev_h_bu(in_h, in_l);
736cabdff1aSopenharmony_ci
737cabdff1aSopenharmony_ci  /* Fold in the products of the odd-indexed pairs and return the sum. */
738cabdff1aSopenharmony_ci  return __lasx_xvmaddwod_h_bu(even_part, in_h, in_l);
739cabdff1aSopenharmony_ci}
740cabdff1aSopenharmony_ci
741cabdff1aSopenharmony_ci/*
742cabdff1aSopenharmony_ci * =============================================================================
743cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements
744cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
745cabdff1aSopenharmony_ci *               Output - out
746cabdff1aSopenharmony_ci *               Return Type - signed halfword
747cabdff1aSopenharmony_ci * Details     : Signed byte elements from in_h are multiplied with
748cabdff1aSopenharmony_ci *               signed byte elements from in_l producing a result
749cabdff1aSopenharmony_ci *               twice the size of input i.e. signed halfword.
750cabdff1aSopenharmony_ci *               Then the products of adjacent odd-even elements
751cabdff1aSopenharmony_ci *               are added together to form the out vector.
752cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
753cabdff1aSopenharmony_ci * =============================================================================
754cabdff1aSopenharmony_ci */
755cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
756cabdff1aSopenharmony_ci  /* Widening multiply of the even-indexed signed byte pairs. */
757cabdff1aSopenharmony_ci  __m256i even_part = __lasx_xvmulwev_h_b(in_h, in_l);
758cabdff1aSopenharmony_ci
759cabdff1aSopenharmony_ci  /* Fold in the products of the odd-indexed pairs and return the sum. */
760cabdff1aSopenharmony_ci  return __lasx_xvmaddwod_h_b(even_part, in_h, in_l);
761cabdff1aSopenharmony_ci}
762cabdff1aSopenharmony_ci
763cabdff1aSopenharmony_ci/*
764cabdff1aSopenharmony_ci * =============================================================================
765cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements
766cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
767cabdff1aSopenharmony_ci *               Output - out
768cabdff1aSopenharmony_ci *               Return Type - signed word
769cabdff1aSopenharmony_ci * Details     : Signed halfword elements from in_h are multiplied with
770cabdff1aSopenharmony_ci *               signed halfword elements from in_l producing a result
771cabdff1aSopenharmony_ci *               twice the size of input i.e. signed word.
772cabdff1aSopenharmony_ci *               Then the products of adjacent odd-even elements
773cabdff1aSopenharmony_ci *               are added together to form the out vector.
774cabdff1aSopenharmony_ci * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
775cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
776cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
777cabdff1aSopenharmony_ci *         out : 22,38,38,22, 22,38,38,22
778cabdff1aSopenharmony_ci * =============================================================================
779cabdff1aSopenharmony_ci */
780cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
781cabdff1aSopenharmony_ci  /* Widening multiply of the even-indexed signed halfword pairs. */
782cabdff1aSopenharmony_ci  __m256i even_part = __lasx_xvmulwev_w_h(in_h, in_l);
783cabdff1aSopenharmony_ci
784cabdff1aSopenharmony_ci  /* Fold in the products of the odd-indexed pairs and return the sum. */
785cabdff1aSopenharmony_ci  return __lasx_xvmaddwod_w_h(even_part, in_h, in_l);
786cabdff1aSopenharmony_ci}
787cabdff1aSopenharmony_ci
788cabdff1aSopenharmony_ci/*
789cabdff1aSopenharmony_ci * =============================================================================
790cabdff1aSopenharmony_ci * Description : Dot product of word vector elements
791cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
792cabdff1aSopenharmony_ci *               Output - out
793cabdff1aSopenharmony_ci *               Return Type - signed double
794cabdff1aSopenharmony_ci * Details     : Signed word elements from in_h are multiplied with
795cabdff1aSopenharmony_ci *               signed word elements from in_l producing a result
796cabdff1aSopenharmony_ci *               twice the size of input i.e. signed double-word.
797cabdff1aSopenharmony_ci *               Then the products of adjacent odd-even elements
798cabdff1aSopenharmony_ci *               are added together to form the out vector.
799cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
800cabdff1aSopenharmony_ci * =============================================================================
801cabdff1aSopenharmony_ci */
802cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
803cabdff1aSopenharmony_ci  /* Widening multiply of the even-indexed signed word pairs. */
804cabdff1aSopenharmony_ci  __m256i even_part = __lasx_xvmulwev_d_w(in_h, in_l);
805cabdff1aSopenharmony_ci
806cabdff1aSopenharmony_ci  /* Fold in the products of the odd-indexed pairs and return the sum. */
807cabdff1aSopenharmony_ci  return __lasx_xvmaddwod_d_w(even_part, in_h, in_l);
808cabdff1aSopenharmony_ci}
809cabdff1aSopenharmony_ci
810cabdff1aSopenharmony_ci/*
811cabdff1aSopenharmony_ci * =============================================================================
812cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements
813cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
814cabdff1aSopenharmony_ci *               Output - out
815cabdff1aSopenharmony_ci *               Return Type - signed word
816cabdff1aSopenharmony_ci * Details     : Unsigned halfword elements from in_h are multiplied with
817cabdff1aSopenharmony_ci *               signed halfword elements from in_l producing a result
818cabdff1aSopenharmony_ci *               twice the size of input i.e. signed word.
819cabdff1aSopenharmony_ci *               Then the products of adjacent odd-even elements
820cabdff1aSopenharmony_ci *               are added together to form the out vector.
821cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
822cabdff1aSopenharmony_ci * =============================================================================
823cabdff1aSopenharmony_ci */
824cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
825cabdff1aSopenharmony_ci  /* Widening multiply of even-indexed pairs: unsigned in_h by signed in_l. */
826cabdff1aSopenharmony_ci  __m256i even_part = __lasx_xvmulwev_w_hu_h(in_h, in_l);
827cabdff1aSopenharmony_ci
828cabdff1aSopenharmony_ci  /* Fold in the products of the odd-indexed pairs and return the sum. */
829cabdff1aSopenharmony_ci  return __lasx_xvmaddwod_w_hu_h(even_part, in_h, in_l);
830cabdff1aSopenharmony_ci}
831cabdff1aSopenharmony_ci
832cabdff1aSopenharmony_ci/*
833cabdff1aSopenharmony_ci * =============================================================================
834cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements
835cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
836cabdff1aSopenharmony_ci *               Output - out
837cabdff1aSopenharmony_ci *               Return Type - halfword
838cabdff1aSopenharmony_ci * Details     : Signed byte elements from in_h are multiplied with
839cabdff1aSopenharmony_ci *               signed byte elements from in_l producing a result
840cabdff1aSopenharmony_ci *               twice the size of input i.e. signed halfword.
841cabdff1aSopenharmony_ci *               Then the products of adjacent odd-even elements
842cabdff1aSopenharmony_ci *               are added to the in_c vector.
843cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
844cabdff1aSopenharmony_ci * =============================================================================
845cabdff1aSopenharmony_ci */
846cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
847cabdff1aSopenharmony_ci                                          __m256i in_l) {
848cabdff1aSopenharmony_ci  /* Start from in_c and add the even-indexed signed byte products. */
849cabdff1aSopenharmony_ci  __m256i running_sum = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
850cabdff1aSopenharmony_ci
851cabdff1aSopenharmony_ci  /* Add the odd-indexed products on top and return the total. */
852cabdff1aSopenharmony_ci  return __lasx_xvmaddwod_h_b(running_sum, in_h, in_l);
853cabdff1aSopenharmony_ci}
854cabdff1aSopenharmony_ci
855cabdff1aSopenharmony_ci/*
856cabdff1aSopenharmony_ci * =============================================================================
857cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements
858cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
859cabdff1aSopenharmony_ci *               Output - out
860cabdff1aSopenharmony_ci *               Return Type - halfword
861cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied with
862cabdff1aSopenharmony_ci *               unsigned byte elements from in_l producing a result
863cabdff1aSopenharmony_ci *               twice the size of input i.e. signed halfword.
864cabdff1aSopenharmony_ci *               Then this multiplied results of adjacent odd-even elements
865cabdff1aSopenharmony_ci *               are added to the in_c vector.
866cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
867cabdff1aSopenharmony_ci * =============================================================================
868cabdff1aSopenharmony_ci */
869cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
870cabdff1aSopenharmony_ci                                           __m256i in_l) {
871cabdff1aSopenharmony_ci  __m256i out;
872cabdff1aSopenharmony_ci
873cabdff1aSopenharmony_ci  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
874cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
875cabdff1aSopenharmony_ci  return out;
876cabdff1aSopenharmony_ci}
877cabdff1aSopenharmony_ci
878cabdff1aSopenharmony_ci/*
879cabdff1aSopenharmony_ci * =============================================================================
880cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements
881cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
882cabdff1aSopenharmony_ci *               Output - out
883cabdff1aSopenharmony_ci *               Return Type - halfword
884cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied with
885cabdff1aSopenharmony_ci *               signed byte elements from in_l producing a result
886cabdff1aSopenharmony_ci *               twice the size of input i.e. signed halfword.
887cabdff1aSopenharmony_ci *               Then this multiplied results of adjacent odd-even elements
888cabdff1aSopenharmony_ci *               are added to the in_c vector.
889cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
890cabdff1aSopenharmony_ci * =============================================================================
891cabdff1aSopenharmony_ci */
892cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
893cabdff1aSopenharmony_ci                                             __m256i in_l) {
894cabdff1aSopenharmony_ci  __m256i out;
895cabdff1aSopenharmony_ci
896cabdff1aSopenharmony_ci  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
897cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
898cabdff1aSopenharmony_ci  return out;
899cabdff1aSopenharmony_ci}
900cabdff1aSopenharmony_ci
901cabdff1aSopenharmony_ci/*
902cabdff1aSopenharmony_ci * =============================================================================
903cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements
904cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
905cabdff1aSopenharmony_ci *               Output - out
906cabdff1aSopenharmony_ci *               Return Type - per RTYPE
907cabdff1aSopenharmony_ci * Details     : Signed halfword elements from in_h are multiplied with
908cabdff1aSopenharmony_ci *               signed halfword elements from in_l producing a result
909cabdff1aSopenharmony_ci *               twice the size of input i.e. signed word.
910cabdff1aSopenharmony_ci *               Multiplication result of adjacent odd-even elements
911cabdff1aSopenharmony_ci *               are added to the in_c vector.
912cabdff1aSopenharmony_ci * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
913cabdff1aSopenharmony_ci *        in_c : 1,2,3,4, 1,2,3,4
914cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
915cabdff1aSopenharmony_ci *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
916cabdff1aSopenharmony_ci *         out : 23,40,41,26, 23,40,41,26
917cabdff1aSopenharmony_ci * =============================================================================
918cabdff1aSopenharmony_ci */
919cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
920cabdff1aSopenharmony_ci                                          __m256i in_l) {
921cabdff1aSopenharmony_ci  __m256i out;
922cabdff1aSopenharmony_ci
923cabdff1aSopenharmony_ci  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
924cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
925cabdff1aSopenharmony_ci  return out;
926cabdff1aSopenharmony_ci}
927cabdff1aSopenharmony_ci
928cabdff1aSopenharmony_ci/*
929cabdff1aSopenharmony_ci * =============================================================================
930cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements
931cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
932cabdff1aSopenharmony_ci *               Output - out
933cabdff1aSopenharmony_ci *               Return Type - signed word
934cabdff1aSopenharmony_ci * Details     : Unsigned halfword elements from in_h are multiplied with
935cabdff1aSopenharmony_ci *               unsigned halfword elements from in_l producing a result
936cabdff1aSopenharmony_ci *               twice the size of input i.e. signed word.
937cabdff1aSopenharmony_ci *               Multiplication result of adjacent odd-even elements
938cabdff1aSopenharmony_ci *               are added to the in_c vector.
939cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
940cabdff1aSopenharmony_ci * =============================================================================
941cabdff1aSopenharmony_ci */
942cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
943cabdff1aSopenharmony_ci                                           __m256i in_l) {
944cabdff1aSopenharmony_ci  __m256i out;
945cabdff1aSopenharmony_ci
946cabdff1aSopenharmony_ci  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
947cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
948cabdff1aSopenharmony_ci  return out;
949cabdff1aSopenharmony_ci}
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci/*
952cabdff1aSopenharmony_ci * =============================================================================
953cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements
954cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
955cabdff1aSopenharmony_ci *               Output - out
956cabdff1aSopenharmony_ci *               Return Type - signed word
957cabdff1aSopenharmony_ci * Details     : Unsigned halfword elements from in_h are multiplied with
958cabdff1aSopenharmony_ci *               signed halfword elements from in_l producing a result
959cabdff1aSopenharmony_ci *               twice the size of input i.e. signed word.
960cabdff1aSopenharmony_ci *               Multiplication result of adjacent odd-even elements
961cabdff1aSopenharmony_ci *               are added to the in_c vector
962cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
963cabdff1aSopenharmony_ci * =============================================================================
964cabdff1aSopenharmony_ci */
965cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
966cabdff1aSopenharmony_ci                                             __m256i in_l) {
967cabdff1aSopenharmony_ci  __m256i out;
968cabdff1aSopenharmony_ci
969cabdff1aSopenharmony_ci  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
970cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971cabdff1aSopenharmony_ci  return out;
972cabdff1aSopenharmony_ci}
973cabdff1aSopenharmony_ci
974cabdff1aSopenharmony_ci/*
975cabdff1aSopenharmony_ci * =============================================================================
976cabdff1aSopenharmony_ci * Description : Vector Unsigned Dot Product and Subtract
977cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
978cabdff1aSopenharmony_ci *               Output - out
979cabdff1aSopenharmony_ci *               Return Type - signed halfword
980cabdff1aSopenharmony_ci * Details     : Unsigned byte elements from in_h are multiplied with
981cabdff1aSopenharmony_ci *               unsigned byte elements from in_l producing a result
982cabdff1aSopenharmony_ci *               twice the size of input i.e. signed halfword.
983cabdff1aSopenharmony_ci *               Multiplication result of adjacent odd-even elements
984cabdff1aSopenharmony_ci *               are added together and subtracted from double width elements
985cabdff1aSopenharmony_ci *               in_c vector.
986cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
987cabdff1aSopenharmony_ci * =============================================================================
988cabdff1aSopenharmony_ci */
989cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
990cabdff1aSopenharmony_ci                                           __m256i in_l) {
991cabdff1aSopenharmony_ci  __m256i out;
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_ci  out = __lasx_xvmulwev_h_bu(in_h, in_l);
994cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
995cabdff1aSopenharmony_ci  out = __lasx_xvsub_h(in_c, out);
996cabdff1aSopenharmony_ci  return out;
997cabdff1aSopenharmony_ci}
998cabdff1aSopenharmony_ci
999cabdff1aSopenharmony_ci/*
1000cabdff1aSopenharmony_ci * =============================================================================
1001cabdff1aSopenharmony_ci * Description : Vector Signed Dot Product and Subtract
1002cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
1003cabdff1aSopenharmony_ci *               Output - out
1004cabdff1aSopenharmony_ci *               Return Type - signed word
1005cabdff1aSopenharmony_ci * Details     : Signed halfword elements from in_h are multiplied with
1006cabdff1aSopenharmony_ci *               Signed halfword elements from in_l producing a result
1007cabdff1aSopenharmony_ci *               twice the size of input i.e. signed word.
1008cabdff1aSopenharmony_ci *               Multiplication result of adjacent odd-even elements
1009cabdff1aSopenharmony_ci *               are added together and subtracted from double width elements
1010cabdff1aSopenharmony_ci *               in_c vector.
1011cabdff1aSopenharmony_ci * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1012cabdff1aSopenharmony_ci *        in_c : 0,0,0,0, 0,0,0,0
1013cabdff1aSopenharmony_ci *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1014cabdff1aSopenharmony_ci *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1015cabdff1aSopenharmony_ci *         out : -7,-3,0,0, 0,-1,0,-1
1016cabdff1aSopenharmony_ci * =============================================================================
1017cabdff1aSopenharmony_ci */
1018cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1019cabdff1aSopenharmony_ci                                          __m256i in_l) {
1020cabdff1aSopenharmony_ci  __m256i out;
1021cabdff1aSopenharmony_ci
1022cabdff1aSopenharmony_ci  out = __lasx_xvmulwev_w_h(in_h, in_l);
1023cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1024cabdff1aSopenharmony_ci  out = __lasx_xvsub_w(in_c, out);
1025cabdff1aSopenharmony_ci  return out;
1026cabdff1aSopenharmony_ci}
1027cabdff1aSopenharmony_ci
1028cabdff1aSopenharmony_ci/*
1029cabdff1aSopenharmony_ci * =============================================================================
1030cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements
1031cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1032cabdff1aSopenharmony_ci *               Output - out
1033cabdff1aSopenharmony_ci *               Return Type - signed word
1034cabdff1aSopenharmony_ci * Details     : Signed halfword elements from in_h are multiplied with
1035cabdff1aSopenharmony_ci *               signed halfword elements from in_l producing a result
1036cabdff1aSopenharmony_ci *               four times the size of input i.e. signed doubleword.
1037cabdff1aSopenharmony_ci *               Then this multiplication results of four adjacent elements
1038cabdff1aSopenharmony_ci *               are added together and stored to the out vector.
1039cabdff1aSopenharmony_ci * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
1040cabdff1aSopenharmony_ci *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1041cabdff1aSopenharmony_ci *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1042cabdff1aSopenharmony_ci *         out : -2,0,1,1
1043cabdff1aSopenharmony_ci * =============================================================================
1044cabdff1aSopenharmony_ci */
1045cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1046cabdff1aSopenharmony_ci  __m256i out;
1047cabdff1aSopenharmony_ci
1048cabdff1aSopenharmony_ci  out = __lasx_xvmulwev_w_h(in_h, in_l);
1049cabdff1aSopenharmony_ci  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1050cabdff1aSopenharmony_ci  out = __lasx_xvhaddw_d_w(out, out);
1051cabdff1aSopenharmony_ci  return out;
1052cabdff1aSopenharmony_ci}
1053cabdff1aSopenharmony_ci
1054cabdff1aSopenharmony_ci/*
1055cabdff1aSopenharmony_ci * =============================================================================
1056cabdff1aSopenharmony_ci * Description : The high half of the vector elements are expanded and
1057cabdff1aSopenharmony_ci *               added after being doubled.
1058cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1059cabdff1aSopenharmony_ci *               Output - out
1060cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are added after the
1061cabdff1aSopenharmony_ci *               higher half of the two-fold sign extension (signed byte
1062cabdff1aSopenharmony_ci *               to signed halfword) and stored to the out vector.
1063cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1064cabdff1aSopenharmony_ci * =============================================================================
1065cabdff1aSopenharmony_ci */
1066cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1067cabdff1aSopenharmony_ci  __m256i out;
1068cabdff1aSopenharmony_ci
1069cabdff1aSopenharmony_ci  out = __lasx_xvilvh_b(in_h, in_l);
1070cabdff1aSopenharmony_ci  out = __lasx_xvhaddw_h_b(out, out);
1071cabdff1aSopenharmony_ci  return out;
1072cabdff1aSopenharmony_ci}
1073cabdff1aSopenharmony_ci
1074cabdff1aSopenharmony_ci/*
1075cabdff1aSopenharmony_ci * =============================================================================
1076cabdff1aSopenharmony_ci * Description : The high half of the vector elements are expanded and
1077cabdff1aSopenharmony_ci *               added after being doubled.
1078cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1079cabdff1aSopenharmony_ci *               Output - out
1080cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are added after the
1081cabdff1aSopenharmony_ci *               higher half of the two-fold sign extension (signed halfword
1082cabdff1aSopenharmony_ci *               to signed word) and stored to the out vector.
1083cabdff1aSopenharmony_ci * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
1084cabdff1aSopenharmony_ci *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1085cabdff1aSopenharmony_ci *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1086cabdff1aSopenharmony_ci *         out : 1,0,0,-1, 1,0,0, 2
1087cabdff1aSopenharmony_ci * =============================================================================
1088cabdff1aSopenharmony_ci */
1089cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1090cabdff1aSopenharmony_ci  __m256i out;
1091cabdff1aSopenharmony_ci
1092cabdff1aSopenharmony_ci  out = __lasx_xvilvh_h(in_h, in_l);
1093cabdff1aSopenharmony_ci  out = __lasx_xvhaddw_w_h(out, out);
1094cabdff1aSopenharmony_ci  return out;
1095cabdff1aSopenharmony_ci}
1096cabdff1aSopenharmony_ci
1097cabdff1aSopenharmony_ci/*
1098cabdff1aSopenharmony_ci * =============================================================================
1099cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and
1100cabdff1aSopenharmony_ci *               added after being doubled.
1101cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1102cabdff1aSopenharmony_ci *               Output - out
1103cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are added after the
1104cabdff1aSopenharmony_ci *               lower half of the two-fold sign extension (signed byte
1105cabdff1aSopenharmony_ci *               to signed halfword) and stored to the out vector.
1106cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1107cabdff1aSopenharmony_ci * =============================================================================
1108cabdff1aSopenharmony_ci */
1109cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1110cabdff1aSopenharmony_ci  __m256i out;
1111cabdff1aSopenharmony_ci
1112cabdff1aSopenharmony_ci  out = __lasx_xvilvl_b(in_h, in_l);
1113cabdff1aSopenharmony_ci  out = __lasx_xvhaddw_h_b(out, out);
1114cabdff1aSopenharmony_ci  return out;
1115cabdff1aSopenharmony_ci}
1116cabdff1aSopenharmony_ci
1117cabdff1aSopenharmony_ci/*
1118cabdff1aSopenharmony_ci * =============================================================================
1119cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and
1120cabdff1aSopenharmony_ci *               added after being doubled.
1121cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1122cabdff1aSopenharmony_ci *               Output - out
1123cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are added after the
1124cabdff1aSopenharmony_ci *               lower half of the two-fold sign extension (signed halfword
1125cabdff1aSopenharmony_ci *               to signed word) and stored to the out vector.
1126cabdff1aSopenharmony_ci * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1127cabdff1aSopenharmony_ci *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1128cabdff1aSopenharmony_ci *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1129cabdff1aSopenharmony_ci *         out : 5,-1,4,2, 1,0,2,-1
1130cabdff1aSopenharmony_ci * =============================================================================
1131cabdff1aSopenharmony_ci */
1132cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1133cabdff1aSopenharmony_ci  __m256i out;
1134cabdff1aSopenharmony_ci
1135cabdff1aSopenharmony_ci  out = __lasx_xvilvl_h(in_h, in_l);
1136cabdff1aSopenharmony_ci  out = __lasx_xvhaddw_w_h(out, out);
1137cabdff1aSopenharmony_ci  return out;
1138cabdff1aSopenharmony_ci}
1139cabdff1aSopenharmony_ci
1140cabdff1aSopenharmony_ci/*
1141cabdff1aSopenharmony_ci * =============================================================================
1142cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and
1143cabdff1aSopenharmony_ci *               added after being doubled.
1144cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1145cabdff1aSopenharmony_ci *               Output - out
1146cabdff1aSopenharmony_ci * Details     : The out vector and the out vector are added after the
1147cabdff1aSopenharmony_ci *               lower half of the two-fold zero extension (unsigned byte
1148cabdff1aSopenharmony_ci *               to unsigned halfword) and stored to the out vector.
1149cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1150cabdff1aSopenharmony_ci * =============================================================================
1151cabdff1aSopenharmony_ci */
1152cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1153cabdff1aSopenharmony_ci  __m256i out;
1154cabdff1aSopenharmony_ci
1155cabdff1aSopenharmony_ci  out = __lasx_xvilvl_b(in_h, in_l);
1156cabdff1aSopenharmony_ci  out = __lasx_xvhaddw_hu_bu(out, out);
1157cabdff1aSopenharmony_ci  return out;
1158cabdff1aSopenharmony_ci}
1159cabdff1aSopenharmony_ci
1160cabdff1aSopenharmony_ci/*
1161cabdff1aSopenharmony_ci * =============================================================================
1162cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and
1163cabdff1aSopenharmony_ci *               added after being doubled.
1164cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1165cabdff1aSopenharmony_ci *               Output - out
1166cabdff1aSopenharmony_ci * Details     : The in_l vector after double zero extension (unsigned byte to
1167cabdff1aSopenharmony_ci *               signed halfword),added to the in_h vector.
1168cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1169cabdff1aSopenharmony_ci * =============================================================================
1170cabdff1aSopenharmony_ci */
1171cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1172cabdff1aSopenharmony_ci  __m256i out;
1173cabdff1aSopenharmony_ci
1174cabdff1aSopenharmony_ci  out = __lasx_xvsllwil_hu_bu(in_l, 0);
1175cabdff1aSopenharmony_ci  out = __lasx_xvadd_h(in_h, out);
1176cabdff1aSopenharmony_ci  return out;
1177cabdff1aSopenharmony_ci}
1178cabdff1aSopenharmony_ci
1179cabdff1aSopenharmony_ci/*
1180cabdff1aSopenharmony_ci * =============================================================================
1181cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and
1182cabdff1aSopenharmony_ci *               added after being doubled.
1183cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1184cabdff1aSopenharmony_ci *               Output - out
1185cabdff1aSopenharmony_ci * Details     : The in_l vector after double sign extension (signed halfword to
1186cabdff1aSopenharmony_ci *               signed word), added to the in_h vector.
1187cabdff1aSopenharmony_ci * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1188cabdff1aSopenharmony_ci *        in_h : 0, 1,0,0, -1,0,0,1,
1189cabdff1aSopenharmony_ci *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1190cabdff1aSopenharmony_ci *         out : 2, 0,1,2, -1,0,1,1,
1191cabdff1aSopenharmony_ci * =============================================================================
1192cabdff1aSopenharmony_ci */
1193cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1194cabdff1aSopenharmony_ci  __m256i out;
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci  out = __lasx_xvsllwil_w_h(in_l, 0);
1197cabdff1aSopenharmony_ci  out = __lasx_xvadd_w(in_h, out);
1198cabdff1aSopenharmony_ci  return out;
1199cabdff1aSopenharmony_ci}
1200cabdff1aSopenharmony_ci
1201cabdff1aSopenharmony_ci/*
1202cabdff1aSopenharmony_ci * =============================================================================
1203cabdff1aSopenharmony_ci * Description : Multiplication and addition calculation after expansion
1204cabdff1aSopenharmony_ci *               of the lower half of the vector.
1205cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
1206cabdff1aSopenharmony_ci *               Output - out
1207cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are multiplied after
1208cabdff1aSopenharmony_ci *               the lower half of the two-fold sign extension (signed halfword
1209cabdff1aSopenharmony_ci *               to signed word), and the result is added to the vector in_c,
1210cabdff1aSopenharmony_ci *               then stored to the out vector.
1211cabdff1aSopenharmony_ci * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1212cabdff1aSopenharmony_ci *        in_c : 1,2,3,4, 5,6,7,8
1213cabdff1aSopenharmony_ci *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1214cabdff1aSopenharmony_ci *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1215cabdff1aSopenharmony_ci *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1216cabdff1aSopenharmony_ci *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1217cabdff1aSopenharmony_ci * =============================================================================
1218cabdff1aSopenharmony_ci */
1219cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1220cabdff1aSopenharmony_ci                                          __m256i in_l) {
1221cabdff1aSopenharmony_ci  __m256i tmp0, tmp1, out;
1222cabdff1aSopenharmony_ci
1223cabdff1aSopenharmony_ci  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1224cabdff1aSopenharmony_ci  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1225cabdff1aSopenharmony_ci  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1226cabdff1aSopenharmony_ci  out = __lasx_xvadd_w(tmp0, in_c);
1227cabdff1aSopenharmony_ci  return out;
1228cabdff1aSopenharmony_ci}
1229cabdff1aSopenharmony_ci
1230cabdff1aSopenharmony_ci/*
1231cabdff1aSopenharmony_ci * =============================================================================
1232cabdff1aSopenharmony_ci * Description : Multiplication and addition calculation after expansion
1233cabdff1aSopenharmony_ci *               of the higher half of the vector.
1234cabdff1aSopenharmony_ci * Arguments   : Inputs - in_c, in_h, in_l
1235cabdff1aSopenharmony_ci *               Output - out
1236cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are multiplied after
1237cabdff1aSopenharmony_ci *               the higher half of the two-fold sign extension (signed
1238cabdff1aSopenharmony_ci *               halfword to signed word), and the result is added to
1239cabdff1aSopenharmony_ci *               the vector in_c, then stored to the out vector.
1240cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1241cabdff1aSopenharmony_ci * =============================================================================
1242cabdff1aSopenharmony_ci */
1243cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
1244cabdff1aSopenharmony_ci                                          __m256i in_l) {
1245cabdff1aSopenharmony_ci  __m256i tmp0, tmp1, out;
1246cabdff1aSopenharmony_ci
1247cabdff1aSopenharmony_ci  tmp0 = __lasx_xvilvh_h(in_h, in_h);
1248cabdff1aSopenharmony_ci  tmp1 = __lasx_xvilvh_h(in_l, in_l);
1249cabdff1aSopenharmony_ci  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1250cabdff1aSopenharmony_ci  out = __lasx_xvadd_w(tmp0, in_c);
1251cabdff1aSopenharmony_ci  return out;
1252cabdff1aSopenharmony_ci}
1253cabdff1aSopenharmony_ci
1254cabdff1aSopenharmony_ci/*
1255cabdff1aSopenharmony_ci * =============================================================================
1256cabdff1aSopenharmony_ci * Description : Multiplication calculation after expansion of the lower
1257cabdff1aSopenharmony_ci *               half of the vector.
1258cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1259cabdff1aSopenharmony_ci *               Output - out
1260cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are multiplied after
1261cabdff1aSopenharmony_ci *               the lower half of the two-fold sign extension (signed
1262cabdff1aSopenharmony_ci *               halfword to signed word), then stored to the out vector.
1263cabdff1aSopenharmony_ci * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1264cabdff1aSopenharmony_ci *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1265cabdff1aSopenharmony_ci *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1266cabdff1aSopenharmony_ci *         out : 6,1,3,0, 0,0,1,0
1267cabdff1aSopenharmony_ci * =============================================================================
1268cabdff1aSopenharmony_ci */
1269cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1270cabdff1aSopenharmony_ci  __m256i tmp0, tmp1, out;
1271cabdff1aSopenharmony_ci
1272cabdff1aSopenharmony_ci  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1273cabdff1aSopenharmony_ci  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1274cabdff1aSopenharmony_ci  out = __lasx_xvmul_w(tmp0, tmp1);
1275cabdff1aSopenharmony_ci  return out;
1276cabdff1aSopenharmony_ci}
1277cabdff1aSopenharmony_ci
1278cabdff1aSopenharmony_ci/*
1279cabdff1aSopenharmony_ci * =============================================================================
1280cabdff1aSopenharmony_ci * Description : Multiplication calculation after expansion of the lower
1281cabdff1aSopenharmony_ci *               half of the vector.
1282cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1283cabdff1aSopenharmony_ci *               Output - out
1284cabdff1aSopenharmony_ci * Details     : The in_h vector and the in_l vector are multiplied after
1285cabdff1aSopenharmony_ci *               the lower half of the two-fold sign extension (signed
1286cabdff1aSopenharmony_ci *               halfword to signed word), then stored to the out vector.
1287cabdff1aSopenharmony_ci * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1288cabdff1aSopenharmony_ci *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1289cabdff1aSopenharmony_ci *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1290cabdff1aSopenharmony_ci *         out : 0,0,0,0, 0,0,0,1
1291cabdff1aSopenharmony_ci * =============================================================================
1292cabdff1aSopenharmony_ci */
1293cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1294cabdff1aSopenharmony_ci  __m256i tmp0, tmp1, out;
1295cabdff1aSopenharmony_ci
1296cabdff1aSopenharmony_ci  tmp0 = __lasx_xvilvh_h(in_h, in_h);
1297cabdff1aSopenharmony_ci  tmp1 = __lasx_xvilvh_h(in_l, in_l);
1298cabdff1aSopenharmony_ci  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1299cabdff1aSopenharmony_ci  return out;
1300cabdff1aSopenharmony_ci}
1301cabdff1aSopenharmony_ci
1302cabdff1aSopenharmony_ci/*
1303cabdff1aSopenharmony_ci * =============================================================================
1304cabdff1aSopenharmony_ci * Description : The low half of the vector elements are added to the high half
1305cabdff1aSopenharmony_ci *               after being doubled, then saturated.
1306cabdff1aSopenharmony_ci * Arguments   : Inputs - in_h, in_l
1307cabdff1aSopenharmony_ci *               Output - out
1308cabdff1aSopenharmony_ci * Details     : The in_h vector adds the in_l vector after the lower half of
1309cabdff1aSopenharmony_ci *               the two-fold zero extension (unsigned byte to unsigned
1310cabdff1aSopenharmony_ci *               halfword) and then saturated. The results are stored to the out
1311cabdff1aSopenharmony_ci *               vector.
1312cabdff1aSopenharmony_ci * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1313cabdff1aSopenharmony_ci *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1314cabdff1aSopenharmony_ci *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1315cabdff1aSopenharmony_ci *               0,0,0,1
1316cabdff1aSopenharmony_ci *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1317cabdff1aSopenharmony_ci * =============================================================================
1318cabdff1aSopenharmony_ci */
1319cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1320cabdff1aSopenharmony_ci  __m256i tmp1, out;
1321cabdff1aSopenharmony_ci  __m256i zero = { 0 };
1322cabdff1aSopenharmony_ci
1323cabdff1aSopenharmony_ci  tmp1 = __lasx_xvilvl_b(zero, in_l);
1324cabdff1aSopenharmony_ci  out = __lasx_xvsadd_hu(in_h, tmp1);
1325cabdff1aSopenharmony_ci  return out;
1326cabdff1aSopenharmony_ci}
1327cabdff1aSopenharmony_ci
1328cabdff1aSopenharmony_ci/*
1329cabdff1aSopenharmony_ci * =============================================================================
1330cabdff1aSopenharmony_ci * Description : Clip all halfword elements of input vector between min & max
1331cabdff1aSopenharmony_ci *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1332cabdff1aSopenharmony_ci * Arguments   : Inputs  - in    (input vector)
1333cabdff1aSopenharmony_ci *                       - min   (min threshold)
1334cabdff1aSopenharmony_ci *                       - max   (max threshold)
1335cabdff1aSopenharmony_ci *               Outputs - in    (output vector with clipped elements)
1336cabdff1aSopenharmony_ci *               Return Type - signed halfword
1337cabdff1aSopenharmony_ci * Example     : out = __lasx_xvclip_h(in, min, max)
1338cabdff1aSopenharmony_ci *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1339cabdff1aSopenharmony_ci *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1340cabdff1aSopenharmony_ci *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1341cabdff1aSopenharmony_ci *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1342cabdff1aSopenharmony_ci * =============================================================================
1343cabdff1aSopenharmony_ci */
1344cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1345cabdff1aSopenharmony_ci  __m256i out;
1346cabdff1aSopenharmony_ci
1347cabdff1aSopenharmony_ci  out = __lasx_xvmax_h(min, in);
1348cabdff1aSopenharmony_ci  out = __lasx_xvmin_h(max, out);
1349cabdff1aSopenharmony_ci  return out;
1350cabdff1aSopenharmony_ci}
1351cabdff1aSopenharmony_ci
1352cabdff1aSopenharmony_ci/*
1353cabdff1aSopenharmony_ci * =============================================================================
1354cabdff1aSopenharmony_ci * Description : Clip all signed halfword elements of input vector
1355cabdff1aSopenharmony_ci *               between 0 & 255
1356cabdff1aSopenharmony_ci * Arguments   : Inputs  - in   (input vector)
1357cabdff1aSopenharmony_ci *               Outputs - out  (output vector with clipped elements)
1358cabdff1aSopenharmony_ci *               Return Type - signed halfword
1359cabdff1aSopenharmony_ci * Example     : See out = __lasx_xvclip255_w(in)
1360cabdff1aSopenharmony_ci * =============================================================================
1361cabdff1aSopenharmony_ci */
1362cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvclip255_h(__m256i in) {
1363cabdff1aSopenharmony_ci  __m256i out;
1364cabdff1aSopenharmony_ci
1365cabdff1aSopenharmony_ci  out = __lasx_xvmaxi_h(in, 0);
1366cabdff1aSopenharmony_ci  out = __lasx_xvsat_hu(out, 7);
1367cabdff1aSopenharmony_ci  return out;
1368cabdff1aSopenharmony_ci}
1369cabdff1aSopenharmony_ci
1370cabdff1aSopenharmony_ci/*
1371cabdff1aSopenharmony_ci * =============================================================================
1372cabdff1aSopenharmony_ci * Description : Clip all signed word elements of input vector
1373cabdff1aSopenharmony_ci *               between 0 & 255
1374cabdff1aSopenharmony_ci * Arguments   : Inputs - in   (input vector)
1375cabdff1aSopenharmony_ci *               Output - out  (output vector with clipped elements)
1376cabdff1aSopenharmony_ci *               Return Type - signed word
1377cabdff1aSopenharmony_ci * Example     : out = __lasx_xvclip255_w(in)
1378cabdff1aSopenharmony_ci *          in : -8,255,280,249, -8,255,280,249
1379cabdff1aSopenharmony_ci *         out :  0,255,255,249,  0,255,255,249
1380cabdff1aSopenharmony_ci * =============================================================================
1381cabdff1aSopenharmony_ci */
1382cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvclip255_w(__m256i in) {
1383cabdff1aSopenharmony_ci  __m256i out;
1384cabdff1aSopenharmony_ci
1385cabdff1aSopenharmony_ci  out = __lasx_xvmaxi_w(in, 0);
1386cabdff1aSopenharmony_ci  out = __lasx_xvsat_wu(out, 7);
1387cabdff1aSopenharmony_ci  return out;
1388cabdff1aSopenharmony_ci}
1389cabdff1aSopenharmony_ci
1390cabdff1aSopenharmony_ci/*
1391cabdff1aSopenharmony_ci * =============================================================================
1392cabdff1aSopenharmony_ci * Description : Indexed halfword element values are replicated to all
1393cabdff1aSopenharmony_ci *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1394cabdff1aSopenharmony_ci *               if 'idx >= 8' use xvsplati_h_*.
1395cabdff1aSopenharmony_ci * Arguments   : Inputs - in, idx
1396cabdff1aSopenharmony_ci *               Output - out
1397cabdff1aSopenharmony_ci * Details     : Idx element value from in vector is replicated to all
1398cabdff1aSopenharmony_ci *               elements in out vector.
1399cabdff1aSopenharmony_ci *               Valid index range for halfword operation is 0-7
1400cabdff1aSopenharmony_ci * Example     : out = __lasx_xvsplati_l_h(in, idx)
1401cabdff1aSopenharmony_ci *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1402cabdff1aSopenharmony_ci *         idx : 0x02
1403cabdff1aSopenharmony_ci *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1404cabdff1aSopenharmony_ci * =============================================================================
1405cabdff1aSopenharmony_ci */
1406cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1407cabdff1aSopenharmony_ci  __m256i out;
1408cabdff1aSopenharmony_ci
1409cabdff1aSopenharmony_ci  out = __lasx_xvpermi_q(in, in, 0x02);
1410cabdff1aSopenharmony_ci  out = __lasx_xvreplve_h(out, idx);
1411cabdff1aSopenharmony_ci  return out;
1412cabdff1aSopenharmony_ci}
1413cabdff1aSopenharmony_ci
1414cabdff1aSopenharmony_ci/*
1415cabdff1aSopenharmony_ci * =============================================================================
1416cabdff1aSopenharmony_ci * Description : Indexed halfword element values are replicated to all
1417cabdff1aSopenharmony_ci *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1418cabdff1aSopenharmony_ci *               if 'idx >= 8' use xvsplati_h_*.
1419cabdff1aSopenharmony_ci * Arguments   : Inputs - in, idx
1420cabdff1aSopenharmony_ci *               Output - out
1421cabdff1aSopenharmony_ci * Details     : Idx element value from in vector is replicated to all
1422cabdff1aSopenharmony_ci *               elements in out vector.
1423cabdff1aSopenharmony_ci *               Valid index range for halfword operation is 0-7
1424cabdff1aSopenharmony_ci * Example     : out = __lasx_xvsplati_h_h(in, idx)
1425cabdff1aSopenharmony_ci *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1426cabdff1aSopenharmony_ci *         idx : 0x09
1427cabdff1aSopenharmony_ci *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1428cabdff1aSopenharmony_ci * =============================================================================
1429cabdff1aSopenharmony_ci */
1430cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1431cabdff1aSopenharmony_ci  __m256i out;
1432cabdff1aSopenharmony_ci
1433cabdff1aSopenharmony_ci  out = __lasx_xvpermi_q(in, in, 0x13);
1434cabdff1aSopenharmony_ci  out = __lasx_xvreplve_h(out, idx);
1435cabdff1aSopenharmony_ci  return out;
1436cabdff1aSopenharmony_ci}
1437cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double-word elements held in four
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3     (rows of the block)
 *               Outputs - _out0, _out1, _out2, _out3 (rows of the transpose)
 * Details     : All four inputs are read before any output is assigned.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Stage 1: interleave row pairs (0,1) and (2,3) by doubleword. */   \
    __m256i _lo01_m = __lasx_xvilvl_d(_in1, _in0);                       \
    __m256i _hi01_m = __lasx_xvilvh_d(_in1, _in0);                       \
    __m256i _lo23_m = __lasx_xvilvl_d(_in3, _in2);                       \
    __m256i _hi23_m = __lasx_xvilvh_d(_in3, _in2);                       \
    /* Stage 2: merge 128-bit halves of the pairs into output rows. */   \
    _out0 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x20);                    \
    _out2 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x31);                    \
    _out1 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x20);                    \
    _out3 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x31);                    \
  }
1468cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements held in eight vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (rows of the block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (rows of the transpose)
 * Details     : Rows 0-3 and rows 4-7 are each reduced to four intermediate
 *               vectors by two rounds of word interleaving; the 128-bit
 *               halves of matching intermediates are then joined to form the
 *               outputs. All inputs are read before any output is assigned.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _ev_m, _od_m;                                                    \
    __m256i _a0_m, _a1_m, _a2_m, _a3_m;                                      \
    __m256i _b0_m, _b1_m, _b2_m, _b3_m;                                      \
                                                                             \
    /* Rows 0-3, low words of each 128-bit lane. */                          \
    _ev_m = __lasx_xvilvl_w(_in2, _in0);                                     \
    _od_m = __lasx_xvilvl_w(_in3, _in1);                                     \
    _a0_m = __lasx_xvilvl_w(_od_m, _ev_m);                                   \
    _a1_m = __lasx_xvilvh_w(_od_m, _ev_m);                                   \
    /* Rows 0-3, high words of each 128-bit lane. */                         \
    _ev_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _od_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _a2_m = __lasx_xvilvl_w(_od_m, _ev_m);                                   \
    _a3_m = __lasx_xvilvh_w(_od_m, _ev_m);                                   \
    /* Rows 4-7, low words of each 128-bit lane. */                          \
    _ev_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _od_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _b0_m = __lasx_xvilvl_w(_od_m, _ev_m);                                   \
    _b1_m = __lasx_xvilvh_w(_od_m, _ev_m);                                   \
    /* Rows 4-7, high words of each 128-bit lane. */                         \
    _ev_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _od_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _b2_m = __lasx_xvilvl_w(_od_m, _ev_m);                                   \
    _b3_m = __lasx_xvilvh_w(_od_m, _ev_m);                                   \
    /* 0x20 joins the low lanes, 0x31 the high lanes of both groups. */      \
    _out0 = __lasx_xvpermi_q(_b0_m, _a0_m, 0x20);                            \
    _out1 = __lasx_xvpermi_q(_b1_m, _a1_m, 0x20);                            \
    _out2 = __lasx_xvpermi_q(_b2_m, _a2_m, 0x20);                            \
    _out3 = __lasx_xvpermi_q(_b3_m, _a3_m, 0x20);                            \
    _out4 = __lasx_xvpermi_q(_b0_m, _a0_m, 0x31);                            \
    _out5 = __lasx_xvpermi_q(_b1_m, _a1_m, 0x31);                            \
    _out6 = __lasx_xvpermi_q(_b2_m, _a2_m, 0x31);                            \
    _out7 = __lasx_xvpermi_q(_b3_m, _a3_m, 0x31);                            \
  }
1528cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The transpose is built from byte, word and doubleword
 *               interleaves. Every input is read before the first output is
 *               assigned; the outputs are then also used as scratch storage
 *               between the interleave stages.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _pair0_m, _pair1_m, _pair2_m, _pair3_m;                           \
    __m256i _pair4_m, _pair5_m, _pair6_m, _pair7_m;                           \
    __m256i _quad0_m, _quad1_m, _quad2_m, _quad3_m;                           \
    __m256i _quad4_m, _quad5_m, _quad6_m, _quad7_m;                           \
                                                                              \
    /* Stage 1: byte-interleave rows that are two apart. */                   \
    _pair0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _pair1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _pair2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _pair3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _pair4_m = __lasx_xvilvl_b(_in10, _in8);                                  \
    _pair5_m = __lasx_xvilvl_b(_in11, _in9);                                  \
    _pair6_m = __lasx_xvilvl_b(_in14, _in12);                                 \
    _pair7_m = __lasx_xvilvl_b(_in15, _in13);                                 \
    /* Stage 2: combine neighbouring pairs; outputs act as scratch. */        \
    _out0 = __lasx_xvilvl_b(_pair1_m, _pair0_m);                              \
    _out1 = __lasx_xvilvh_b(_pair1_m, _pair0_m);                              \
    _out2 = __lasx_xvilvl_b(_pair3_m, _pair2_m);                              \
    _out3 = __lasx_xvilvh_b(_pair3_m, _pair2_m);                              \
    _out4 = __lasx_xvilvl_b(_pair5_m, _pair4_m);                              \
    _out5 = __lasx_xvilvh_b(_pair5_m, _pair4_m);                              \
    _out6 = __lasx_xvilvl_b(_pair7_m, _pair6_m);                              \
    _out7 = __lasx_xvilvh_b(_pair7_m, _pair6_m);                              \
    /* Stage 3: word-interleave the scratch values. */                        \
    _quad0_m = __lasx_xvilvl_w(_out2, _out0);                                 \
    _quad2_m = __lasx_xvilvh_w(_out2, _out0);                                 \
    _quad4_m = __lasx_xvilvl_w(_out3, _out1);                                 \
    _quad6_m = __lasx_xvilvh_w(_out3, _out1);                                 \
    _quad1_m = __lasx_xvilvl_w(_out6, _out4);                                 \
    _quad3_m = __lasx_xvilvh_w(_out6, _out4);                                 \
    _quad5_m = __lasx_xvilvl_w(_out7, _out5);                                 \
    _quad7_m = __lasx_xvilvh_w(_out7, _out5);                                 \
    /* Stage 4: doubleword-interleave into the final rows. */                 \
    _out0 = __lasx_xvilvl_d(_quad1_m, _quad0_m);                              \
    _out1 = __lasx_xvilvh_d(_quad1_m, _quad0_m);                              \
    _out2 = __lasx_xvilvl_d(_quad3_m, _quad2_m);                              \
    _out3 = __lasx_xvilvh_d(_quad3_m, _quad2_m);                              \
    _out4 = __lasx_xvilvl_d(_quad5_m, _quad4_m);                              \
    _out5 = __lasx_xvilvh_d(_quad5_m, _quad4_m);                              \
    _out6 = __lasx_xvilvl_d(_quad7_m, _quad6_m);                              \
    _out7 = __lasx_xvilvh_d(_quad7_m, _quad6_m);                              \
  }
1583cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out0.._out3 are produced from the low halfwords of the
 *               inputs and _out4.._out7 from the high halfwords. The inputs
 *               are read a second time after _out0.._out3 have been assigned,
 *               so those outputs must not be the same variables as any input.
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _hp0_m, _hp1_m, _hp2_m, _hp3_m;                                   \
    __m256i _hp4_m, _hp5_m, _hp6_m, _hp7_m;                                   \
    __m256i _hq0_m, _hq1_m, _hq2_m, _hq3_m;                                   \
    __m256i _hq4_m, _hq5_m, _hq6_m, _hq7_m;                                   \
    __m256i _dp0_m, _dp1_m, _dp2_m, _dp3_m;                                   \
    __m256i _dp4_m, _dp5_m, _dp6_m, _dp7_m;                                   \
                                                                              \
    /* Columns 0-3: start from the low halfwords of each input. */            \
    _hp0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
    _hp1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
    _hp2_m = __lasx_xvilvl_h(_in6, _in4);                                     \
    _hp3_m = __lasx_xvilvl_h(_in7, _in5);                                     \
    _hp4_m = __lasx_xvilvl_h(_in10, _in8);                                    \
    _hp5_m = __lasx_xvilvl_h(_in11, _in9);                                    \
    _hp6_m = __lasx_xvilvl_h(_in14, _in12);                                   \
    _hp7_m = __lasx_xvilvl_h(_in15, _in13);                                   \
    _hq0_m = __lasx_xvilvl_h(_hp1_m, _hp0_m);                                 \
    _hq1_m = __lasx_xvilvh_h(_hp1_m, _hp0_m);                                 \
    _hq2_m = __lasx_xvilvl_h(_hp3_m, _hp2_m);                                 \
    _hq3_m = __lasx_xvilvh_h(_hp3_m, _hp2_m);                                 \
    _hq4_m = __lasx_xvilvl_h(_hp5_m, _hp4_m);                                 \
    _hq5_m = __lasx_xvilvh_h(_hp5_m, _hp4_m);                                 \
    _hq6_m = __lasx_xvilvl_h(_hp7_m, _hp6_m);                                 \
    _hq7_m = __lasx_xvilvh_h(_hp7_m, _hp6_m);                                 \
    _dp0_m = __lasx_xvilvl_d(_hq2_m, _hq0_m);                                 \
    _dp2_m = __lasx_xvilvh_d(_hq2_m, _hq0_m);                                 \
    _dp4_m = __lasx_xvilvl_d(_hq3_m, _hq1_m);                                 \
    _dp6_m = __lasx_xvilvh_d(_hq3_m, _hq1_m);                                 \
    _dp1_m = __lasx_xvilvl_d(_hq6_m, _hq4_m);                                 \
    _dp3_m = __lasx_xvilvh_d(_hq6_m, _hq4_m);                                 \
    _dp5_m = __lasx_xvilvl_d(_hq7_m, _hq5_m);                                 \
    _dp7_m = __lasx_xvilvh_d(_hq7_m, _hq5_m);                                 \
    /* Join the low 128-bit lanes of each pair into one output row. */        \
    _out0 = __lasx_xvpermi_q(_dp1_m, _dp0_m, 0x20);                           \
    _out1 = __lasx_xvpermi_q(_dp3_m, _dp2_m, 0x20);                           \
    _out2 = __lasx_xvpermi_q(_dp5_m, _dp4_m, 0x20);                           \
    _out3 = __lasx_xvpermi_q(_dp7_m, _dp6_m, 0x20);                           \
                                                                              \
    /* Columns 4-7: same steps on the high halfwords; the inputs are */       \
    /* read again here, after _out0.._out3 were assigned. */                  \
    _hp0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
    _hp1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
    _hp2_m = __lasx_xvilvh_h(_in6, _in4);                                     \
    _hp3_m = __lasx_xvilvh_h(_in7, _in5);                                     \
    _hp4_m = __lasx_xvilvh_h(_in10, _in8);                                    \
    _hp5_m = __lasx_xvilvh_h(_in11, _in9);                                    \
    _hp6_m = __lasx_xvilvh_h(_in14, _in12);                                   \
    _hp7_m = __lasx_xvilvh_h(_in15, _in13);                                   \
    _hq0_m = __lasx_xvilvl_h(_hp1_m, _hp0_m);                                 \
    _hq1_m = __lasx_xvilvh_h(_hp1_m, _hp0_m);                                 \
    _hq2_m = __lasx_xvilvl_h(_hp3_m, _hp2_m);                                 \
    _hq3_m = __lasx_xvilvh_h(_hp3_m, _hp2_m);                                 \
    _hq4_m = __lasx_xvilvl_h(_hp5_m, _hp4_m);                                 \
    _hq5_m = __lasx_xvilvh_h(_hp5_m, _hp4_m);                                 \
    _hq6_m = __lasx_xvilvl_h(_hp7_m, _hp6_m);                                 \
    _hq7_m = __lasx_xvilvh_h(_hp7_m, _hp6_m);                                 \
    _dp0_m = __lasx_xvilvl_d(_hq2_m, _hq0_m);                                 \
    _dp2_m = __lasx_xvilvh_d(_hq2_m, _hq0_m);                                 \
    _dp4_m = __lasx_xvilvl_d(_hq3_m, _hq1_m);                                 \
    _dp6_m = __lasx_xvilvh_d(_hq3_m, _hq1_m);                                 \
    _dp1_m = __lasx_xvilvl_d(_hq6_m, _hq4_m);                                 \
    _dp3_m = __lasx_xvilvh_d(_hq6_m, _hq4_m);                                 \
    _dp5_m = __lasx_xvilvl_d(_hq7_m, _hq5_m);                                 \
    _dp7_m = __lasx_xvilvh_d(_hq7_m, _hq5_m);                                 \
    /* Join the low 128-bit lanes of each pair into one output row. */        \
    _out4 = __lasx_xvpermi_q(_dp1_m, _dp0_m, 0x20);                           \
    _out5 = __lasx_xvpermi_q(_dp3_m, _dp2_m, 0x20);                           \
    _out6 = __lasx_xvpermi_q(_dp5_m, _dp4_m, 0x20);                           \
    _out7 = __lasx_xvpermi_q(_dp7_m, _dp6_m, 0x20);                           \
  }
1689cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of halfword elements held in four
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3     (rows of the block)
 *               Outputs - _out0, _out1, _out2, _out3 (rows of the transpose)
 *               Element Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out1 and _out3 are derived from _out0 and _out2 after
 *               those have been assigned.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Interleave halfwords of rows (0,1) and of rows (2,3). */          \
    __m256i _r01_m = __lasx_xvilvl_h(_in1, _in0);                        \
    __m256i _r23_m = __lasx_xvilvl_h(_in3, _in2);                        \
                                                                         \
    /* Each result packs two transposed rows, one per doubleword. */     \
    _out0 = __lasx_xvilvl_w(_r23_m, _r01_m);                             \
    _out2 = __lasx_xvilvh_w(_r23_m, _r01_m);                             \
    /* Bring the upper doubleword down to expose rows 1 and 3. */        \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
  }
1713cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of byte elements held in vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block, one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block, one column per vector)
 * Details     : Rows are merged with two rounds of byte interleaves (rows
 *               0..3 and rows 4..7 separately), then the two halves are
 *               joined with word interleaves to produce the even-numbered
 *               outputs. Each odd-numbered output is the preceding even
 *               output shifted right by 8 bytes.
 *               All inputs are read before any output is written.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _r02_m = __lasx_xvilvl_b(_in2, _in0);                            \
    __m256i _r13_m = __lasx_xvilvl_b(_in3, _in1);                            \
    __m256i _r46_m = __lasx_xvilvl_b(_in6, _in4);                            \
    __m256i _r57_m = __lasx_xvilvl_b(_in7, _in5);                            \
    __m256i _t03_m = __lasx_xvilvl_b(_r13_m, _r02_m);                        \
    __m256i _t47_m = __lasx_xvilvh_b(_r13_m, _r02_m);                        \
    __m256i _b03_m = __lasx_xvilvl_b(_r57_m, _r46_m);                        \
    __m256i _b47_m = __lasx_xvilvh_b(_r57_m, _r46_m);                        \
                                                                             \
    _out0 = __lasx_xvilvl_w(_b03_m, _t03_m);                                 \
    _out2 = __lasx_xvilvh_w(_b03_m, _t03_m);                                 \
    _out4 = __lasx_xvilvl_w(_b47_m, _t47_m);                                 \
    _out6 = __lasx_xvilvh_w(_b47_m, _t47_m);                                 \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
1747cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of halfword elements held in vectors.
 * Arguments   : Inputs  - _in0, _in1, ~ _in7   (one matrix row each)
 *               Outputs - _out0, _out1, ~ _out7 (one matrix column each)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Rows 4..7 and rows 0..3 are each reduced with two rounds
 *               of halfword interleaves; the even/odd doublewords of the two
 *               partial results are then picked to form the outputs.
 *               All inputs are read before any output is written.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Rows 4..7: _bXY_m is built from the low (_lo*) or high (_hi*) */      \
    /* halfword interleaves of those rows.                           */      \
    __m256i _lo46_m = __lasx_xvilvl_h(_in6, _in4);                           \
    __m256i _lo57_m = __lasx_xvilvl_h(_in7, _in5);                           \
    __m256i _b01_m = __lasx_xvilvl_h(_lo57_m, _lo46_m);                      \
    __m256i _b23_m = __lasx_xvilvh_h(_lo57_m, _lo46_m);                      \
    __m256i _hi46_m = __lasx_xvilvh_h(_in6, _in4);                           \
    __m256i _hi57_m = __lasx_xvilvh_h(_in7, _in5);                           \
    __m256i _b45_m = __lasx_xvilvl_h(_hi57_m, _hi46_m);                      \
    __m256i _b67_m = __lasx_xvilvh_h(_hi57_m, _hi46_m);                      \
                                                                             \
    /* Rows 0..3: same reduction, results named _tXY_m. */                   \
    __m256i _lo02_m = __lasx_xvilvl_h(_in2, _in0);                           \
    __m256i _lo13_m = __lasx_xvilvl_h(_in3, _in1);                           \
    __m256i _t01_m = __lasx_xvilvl_h(_lo13_m, _lo02_m);                      \
    __m256i _t23_m = __lasx_xvilvh_h(_lo13_m, _lo02_m);                      \
    __m256i _hi02_m = __lasx_xvilvh_h(_in2, _in0);                           \
    __m256i _hi13_m = __lasx_xvilvh_h(_in3, _in1);                           \
    __m256i _t45_m = __lasx_xvilvl_h(_hi13_m, _hi02_m);                      \
    __m256i _t67_m = __lasx_xvilvh_h(_hi13_m, _hi02_m);                      \
                                                                             \
    /* Even doublewords give the even outputs, odd doublewords the odd. */   \
    _out0 = __lasx_xvpickev_d(_b01_m, _t01_m);                               \
    _out2 = __lasx_xvpickev_d(_b23_m, _t23_m);                               \
    _out4 = __lasx_xvpickev_d(_b45_m, _t45_m);                               \
    _out6 = __lasx_xvpickev_d(_b67_m, _t67_m);                               \
    _out1 = __lasx_xvpickod_d(_b01_m, _t01_m);                               \
    _out3 = __lasx_xvpickod_d(_b23_m, _t23_m);                               \
    _out5 = __lasx_xvpickod_d(_b45_m, _t45_m);                               \
    _out7 = __lasx_xvpickod_d(_b67_m, _t67_m);                               \
  }
1810cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects the
 *               element width of the add/sub intrinsics that are used.
 *               All four results are computed into temporaries before any
 *               output is assigned, so the outputs may name the same
 *               variables as the inputs (in-place use).
 *               Each input expression is evaluated twice (once for the sum,
 *               once for the difference) and must be free of side effects.
 * Example     : LASX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum03_m = __lasx_xvadd_b(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_b(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_b(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_b(_in0, _in3);                             \
                                                                               \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum03_m = __lasx_xvadd_h(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_h(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_h(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_h(_in0, _in3);                             \
                                                                               \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum03_m = __lasx_xvadd_w(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_w(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_w(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_w(_in0, _in3);                             \
                                                                               \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum03_m = __lasx_xvadd_d(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_d(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_d(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_d(_in0, _in3);                             \
                                                                               \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
1852cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects the
 *               element width of the add/sub intrinsics that are used.
 *               All eight results are computed into temporaries before any
 *               output is assigned, so the outputs may name the same
 *               variables as the inputs (in-place use).
 *               Each input expression is evaluated twice (once for the sum,
 *               once for the difference) and must be free of side effects.
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum07_m = __lasx_xvadd_b(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_b(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_b(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_b(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_b(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_b(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_b(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_b(_in0, _in7);                          \
                                                                            \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }

#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum07_m = __lasx_xvadd_h(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_h(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_h(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_h(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_h(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_h(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_h(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_h(_in0, _in7);                          \
                                                                            \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }

#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum07_m = __lasx_xvadd_w(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_w(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_w(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_w(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_w(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_w(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_w(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_w(_in0, _in7);                          \
                                                                            \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }

#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum07_m = __lasx_xvadd_d(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_d(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_d(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_d(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_d(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_d(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_d(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_d(_in0, _in7);                          \
                                                                            \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }
1925cabdff1aSopenharmony_ci
1926cabdff1aSopenharmony_ci#endif  // LASX
1927cabdff1aSopenharmony_ci
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0', if
 *               'enter' is TRUE, prefix "\nVP:" will be added first.
 *               'in0' is cast to 'RTYPE' as a whole expression and indexed
 *               with []. Every element is printed with "%d,", so the element
 *               type of 'RTYPE' must be printable as int.
 *               Requires printf() to be declared (<stdio.h>).
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
  {                                                                \
    /* Arguments are parenthesized so that compound expressions */ \
    /* (e.g. a + b, c ? 2 : 1) keep their intended grouping.    */ \
    RTYPE _vp_vec_m = (RTYPE)(in0);                                \
    int _vp_idx_m;                                                 \
    if (enter) printf("\nVP:");                                    \
    for (_vp_idx_m = 0; _vp_idx_m < (element_num); _vp_idx_m++) {  \
      printf("%d,", _vp_vec_m[_vp_idx_m]);                         \
    }                                                              \
  }
1946cabdff1aSopenharmony_ci
1947cabdff1aSopenharmony_ci#endif /* LOONGSON_INTRINSICS_H */
1948cabdff1aSopenharmony_ci#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
1949