1/* 2 * Copyright (c) 2021 Loongson Technology Corporation Limited 3 * All rights reserved. 4 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 5 * Xiwei Gu <guxiwei-hf@loongson.cn> 6 * Lu Wang <wanglu@loongson.cn> 7 * 8 * This file is part of FFmpeg. 9 * 10 * FFmpeg is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU Lesser General Public 12 * License as published by the Free Software Foundation; either 13 * version 2.1 of the License, or (at your option) any later version. 14 * 15 * FFmpeg is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with FFmpeg; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23 * 24 */ 25 26#ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H 27#define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H 28 29/* 30 * Copyright (c) 2021 Loongson Technology Corporation Limited 31 * All rights reserved. 32 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 33 * Xiwei Gu <guxiwei-hf@loongson.cn> 34 * Lu Wang <wanglu@loongson.cn> 35 * 36 * This file is a header file for loongarch builtin extension. 37 * 38 */ 39 40#ifndef LOONGSON_INTRINSICS_H 41#define LOONGSON_INTRINSICS_H 42 43/** 44 * MAJOR version: Macro usage changes. 45 * MINOR version: Add new functions, or bug fixes. 46 * MICRO version: Comment changes or implementation changes. 47 */ 48#define LSOM_VERSION_MAJOR 1 49#define LSOM_VERSION_MINOR 1 50#define LSOM_VERSION_MICRO 0 51 52#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ 53 { \ 54 _OUT0 = _INS(_IN0); \ 55 _OUT1 = _INS(_IN1); \ 56 } 57 58#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \ 59 { \ 60 _OUT0 = _INS(_IN0, _IN1); \ 61 _OUT1 = _INS(_IN2, _IN3); \ 62 } 63 64#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \ 65 { \ 66 _OUT0 = _INS(_IN0, _IN1, _IN2); \ 67 _OUT1 = _INS(_IN3, _IN4, _IN5); \ 68 } 69 70#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \ 71 { \ 72 DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \ 73 DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \ 74 } 75 76#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \ 77 _OUT1, _OUT2, _OUT3) \ 78 { \ 79 DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \ 80 DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \ 81 } 82 83#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \ 84 _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \ 85 { \ 86 DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \ 87 DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \ 88 } 89 90#ifdef __loongarch_sx 91#include <lsxintrin.h> 92/* 93 * ============================================================================= 94 * Description : Dot product & addition of byte vector elements 95 * Arguments : Inputs - in_c, in_h, in_l 96 * Outputs - out 97 * Return Type - halfword 98 * Details : Signed byte elements from in_h are multiplied by 99 * signed byte elements from in_l, and then added adjacent to 100 * each other to get results with the twice size of input. 101 * Then the results plus to signed half-word elements from in_c. 102 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) 103 * in_c : 1,2,3,4, 1,2,3,4 104 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 105 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 106 * out : 23,40,41,26, 23,40,41,26 107 * ============================================================================= 108 */ 109static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, 110 __m128i in_l) { 111 __m128i out; 112 113 out = __lsx_vmaddwev_h_b(in_c, in_h, in_l); 114 out = __lsx_vmaddwod_h_b(out, in_h, in_l); 115 return out; 116} 117 118/* 119 * ============================================================================= 120 * Description : Dot product & addition of byte vector elements 121 * Arguments : Inputs - in_c, in_h, in_l 122 * Outputs - out 123 * Return Type - halfword 124 * Details : Unsigned byte elements from in_h are multiplied by 125 * unsigned byte elements from in_l, and then added adjacent to 126 * each other to get results with the twice size of input. 127 * The results plus to signed half-word elements from in_c. 128 * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) 129 * in_c : 1,2,3,4, 1,2,3,4 130 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 131 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 132 * out : 23,40,41,26, 23,40,41,26 133 * ============================================================================= 134 */ 135static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, 136 __m128i in_l) { 137 __m128i out; 138 139 out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l); 140 out = __lsx_vmaddwod_h_bu(out, in_h, in_l); 141 return out; 142} 143 144/* 145 * ============================================================================= 146 * Description : Dot product & addition of byte vector elements 147 * Arguments : Inputs - in_c, in_h, in_l 148 * Outputs - out 149 * Return Type - halfword 150 * Details : Unsigned byte elements from in_h are multiplied by 151 * signed byte elements from in_l, and then added adjacent to 152 * each other to get results with the twice size of input. 153 * The results plus to signed half-word elements from in_c. 154 * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) 155 * in_c : 1,1,1,1, 1,1,1,1 156 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 157 * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 158 * out : -4,-24,-60,-112, 6,26,62,114 159 * ============================================================================= 160 */ 161static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, 162 __m128i in_l) { 163 __m128i out; 164 165 out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); 166 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); 167 return out; 168} 169 170/* 171 * ============================================================================= 172 * Description : Dot product & addition of half-word vector elements 173 * Arguments : Inputs - in_c, in_h, in_l 174 * Outputs - out 175 * Return Type - __m128i 176 * Details : Signed half-word elements from in_h are multiplied by 177 * signed half-word elements from in_l, and then added adjacent to 178 * each other to get results with the twice size of input. 179 * Then the results plus to signed word elements from in_c. 180 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) 181 * in_c : 1,2,3,4 182 * in_h : 1,2,3,4, 5,6,7,8 183 * in_l : 8,7,6,5, 4,3,2,1 184 * out : 23,40,41,26 185 * ============================================================================= 186 */ 187static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, 188 __m128i in_l) { 189 __m128i out; 190 191 out = __lsx_vmaddwev_w_h(in_c, in_h, in_l); 192 out = __lsx_vmaddwod_w_h(out, in_h, in_l); 193 return out; 194} 195 196/* 197 * ============================================================================= 198 * Description : Dot product of byte vector elements 199 * Arguments : Inputs - in_h, in_l 200 * Outputs - out 201 * Return Type - halfword 202 * Details : Signed byte elements from in_h are multiplied by 203 * signed byte elements from in_l, and then added adjacent to 204 * each other to get results with the twice size of input. 205 * Example : out = __lsx_vdp2_h_b(in_h, in_l) 206 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 207 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 208 * out : 22,38,38,22, 22,38,38,22 209 * ============================================================================= 210 */ 211static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { 212 __m128i out; 213 214 out = __lsx_vmulwev_h_b(in_h, in_l); 215 out = __lsx_vmaddwod_h_b(out, in_h, in_l); 216 return out; 217} 218 219/* 220 * ============================================================================= 221 * Description : Dot product of byte vector elements 222 * Arguments : Inputs - in_h, in_l 223 * Outputs - out 224 * Return Type - halfword 225 * Details : Unsigned byte elements from in_h are multiplied by 226 * unsigned byte elements from in_l, and then added adjacent to 227 * each other to get results with the twice size of input. 228 * Example : out = __lsx_vdp2_h_bu(in_h, in_l) 229 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 230 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 231 * out : 22,38,38,22, 22,38,38,22 232 * ============================================================================= 233 */ 234static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { 235 __m128i out; 236 237 out = __lsx_vmulwev_h_bu(in_h, in_l); 238 out = __lsx_vmaddwod_h_bu(out, in_h, in_l); 239 return out; 240} 241 242/* 243 * ============================================================================= 244 * Description : Dot product of byte vector elements 245 * Arguments : Inputs - in_h, in_l 246 * Outputs - out 247 * Return Type - halfword 248 * Details : Unsigned byte elements from in_h are multiplied by 249 * signed byte elements from in_l, and then added adjacent to 250 * each other to get results with the twice size of input. 251 * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) 252 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 253 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 254 * out : 22,38,38,22, 22,38,38,6 255 * ============================================================================= 256 */ 257static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { 258 __m128i out; 259 260 out = __lsx_vmulwev_h_bu_b(in_h, in_l); 261 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); 262 return out; 263} 264 265/* 266 * ============================================================================= 267 * Description : Dot product of byte vector elements 268 * Arguments : Inputs - in_h, in_l 269 * Outputs - out 270 * Return Type - halfword 271 * Details : Signed byte elements from in_h are multiplied by 272 * signed byte elements from in_l, and then added adjacent to 273 * each other to get results with the twice size of input. 274 * Example : out = __lsx_vdp2_w_h(in_h, in_l) 275 * in_h : 1,2,3,4, 5,6,7,8 276 * in_l : 8,7,6,5, 4,3,2,1 277 * out : 22,38,38,22 278 * ============================================================================= 279 */ 280static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { 281 __m128i out; 282 283 out = __lsx_vmulwev_w_h(in_h, in_l); 284 out = __lsx_vmaddwod_w_h(out, in_h, in_l); 285 return out; 286} 287 288/* 289 * ============================================================================= 290 * Description : Clip all halfword elements of input vector between min & max 291 * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : 292 * (_in)) 293 * Arguments : Inputs - _in (input vector) 294 * - min (min threshold) 295 * - max (max threshold) 296 * Outputs - out (output vector with clipped elements) 297 * Return Type - signed halfword 298 * Example : out = __lsx_vclip_h(_in) 299 * _in : -8,2,280,249, -8,255,280,249 300 * min : 1,1,1,1, 1,1,1,1 301 * max : 9,9,9,9, 9,9,9,9 302 * out : 1,2,9,9, 1,9,9,9 303 * ============================================================================= 304 */ 305static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { 306 __m128i out; 307 308 out = __lsx_vmax_h(min, _in); 309 out = __lsx_vmin_h(max, out); 310 return out; 311} 312 313/* 314 * ============================================================================= 315 * Description : Set each element of vector between 0 and 255 316 * Arguments : Inputs - _in 317 * Outputs - out 318 * Return Type - halfword 319 * Details : Signed byte elements from _in are clamped between 0 and 255. 320 * Example : out = __lsx_vclip255_h(_in) 321 * _in : -8,255,280,249, -8,255,280,249 322 * out : 0,255,255,249, 0,255,255,249 323 * ============================================================================= 324 */ 325static inline __m128i __lsx_vclip255_h(__m128i _in) { 326 __m128i out; 327 328 out = __lsx_vmaxi_h(_in, 0); 329 out = __lsx_vsat_hu(out, 7); 330 return out; 331} 332 333/* 334 * ============================================================================= 335 * Description : Set each element of vector between 0 and 255 336 * Arguments : Inputs - _in 337 * Outputs - out 338 * Return Type - word 339 * Details : Signed byte elements from _in are clamped between 0 and 255. 340 * Example : out = __lsx_vclip255_w(_in) 341 * _in : -8,255,280,249 342 * out : 0,255,255,249 343 * ============================================================================= 344 */ 345static inline __m128i __lsx_vclip255_w(__m128i _in) { 346 __m128i out; 347 348 out = __lsx_vmaxi_w(_in, 0); 349 out = __lsx_vsat_wu(out, 7); 350 return out; 351} 352 353/* 354 * ============================================================================= 355 * Description : Swap two variables 356 * Arguments : Inputs - _in0, _in1 357 * Outputs - _in0, _in1 (in-place) 358 * Details : Swapping of two input variables using xor 359 * Example : LSX_SWAP(_in0, _in1) 360 * _in0 : 1,2,3,4 361 * _in1 : 5,6,7,8 362 * _in0(out) : 5,6,7,8 363 * _in1(out) : 1,2,3,4 364 * ============================================================================= 365 */ 366#define LSX_SWAP(_in0, _in1) \ 367 { \ 368 _in0 = __lsx_vxor_v(_in0, _in1); \ 369 _in1 = __lsx_vxor_v(_in0, _in1); \ 370 _in0 = __lsx_vxor_v(_in0, _in1); \ 371 } 372 373/* 374 * ============================================================================= 375 * Description : Transpose 4x4 block with word elements in vectors 376 * Arguments : Inputs - in0, in1, in2, in3 377 * Outputs - out0, out1, out2, out3 378 * Details : 379 * Example : 380 * 1, 2, 3, 4 1, 5, 9,13 381 * 5, 6, 7, 8 to 2, 6,10,14 382 * 9,10,11,12 =====> 3, 7,11,15 383 * 13,14,15,16 4, 8,12,16 384 * ============================================================================= 385 */ 386#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 387 { \ 388 __m128i _t0, _t1, _t2, _t3; \ 389 \ 390 _t0 = __lsx_vilvl_w(_in1, _in0); \ 391 _t1 = __lsx_vilvh_w(_in1, _in0); \ 392 _t2 = __lsx_vilvl_w(_in3, _in2); \ 393 _t3 = __lsx_vilvh_w(_in3, _in2); \ 394 _out0 = __lsx_vilvl_d(_t2, _t0); \ 395 _out1 = __lsx_vilvh_d(_t2, _t0); \ 396 _out2 = __lsx_vilvl_d(_t3, _t1); \ 397 _out3 = __lsx_vilvh_d(_t3, _t1); \ 398 } 399 400/* 401 * ============================================================================= 402 * Description : Transpose 8x8 block with byte elements in vectors 403 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 404 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 405 * _out7 406 * Details : The rows of the matrix become columns, and the columns 407 * become rows. 408 * Example : LSX_TRANSPOSE8x8_B 409 * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 410 * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 411 * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 412 * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 413 * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 414 * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 415 * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 416 * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 417 * 418 * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 419 * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 420 * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 421 * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 422 * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 423 * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 424 * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 425 * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 426 * ============================================================================= 427 */ 428#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 429 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 430 _out7) \ 431 { \ 432 __m128i zero = { 0 }; \ 433 __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ 434 __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 435 \ 436 _t0 = __lsx_vilvl_b(_in2, _in0); \ 437 _t1 = __lsx_vilvl_b(_in3, _in1); \ 438 _t2 = __lsx_vilvl_b(_in6, _in4); \ 439 _t3 = __lsx_vilvl_b(_in7, _in5); \ 440 _t4 = __lsx_vilvl_b(_t1, _t0); \ 441 _t5 = __lsx_vilvh_b(_t1, _t0); \ 442 _t6 = __lsx_vilvl_b(_t3, _t2); \ 443 _t7 = __lsx_vilvh_b(_t3, _t2); \ 444 _out0 = __lsx_vilvl_w(_t6, _t4); \ 445 _out2 = __lsx_vilvh_w(_t6, _t4); \ 446 _out4 = __lsx_vilvl_w(_t7, _t5); \ 447 _out6 = __lsx_vilvh_w(_t7, _t5); \ 448 _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ 449 _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ 450 _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ 451 _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ 452 } 453 454/* 455 * ============================================================================= 456 * Description : Transpose 8x8 block with half-word elements in vectors 457 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 458 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 459 * Details : 460 * Example : 461 * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 462 * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 463 * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 464 * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 465 * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 466 * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 467 * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 468 * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 469 * ============================================================================= 470 */ 471#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 472 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 473 _out7) \ 474 { \ 475 __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 476 \ 477 _s0 = __lsx_vilvl_h(_in6, _in4); \ 478 _s1 = __lsx_vilvl_h(_in7, _in5); \ 479 _t0 = __lsx_vilvl_h(_s1, _s0); \ 480 _t1 = __lsx_vilvh_h(_s1, _s0); \ 481 _s0 = __lsx_vilvh_h(_in6, _in4); \ 482 _s1 = __lsx_vilvh_h(_in7, _in5); \ 483 _t2 = __lsx_vilvl_h(_s1, _s0); \ 484 _t3 = __lsx_vilvh_h(_s1, _s0); \ 485 _s0 = __lsx_vilvl_h(_in2, _in0); \ 486 _s1 = __lsx_vilvl_h(_in3, _in1); \ 487 _t4 = __lsx_vilvl_h(_s1, _s0); \ 488 _t5 = __lsx_vilvh_h(_s1, _s0); \ 489 _s0 = __lsx_vilvh_h(_in2, _in0); \ 490 _s1 = __lsx_vilvh_h(_in3, _in1); \ 491 _t6 = __lsx_vilvl_h(_s1, _s0); \ 492 _t7 = __lsx_vilvh_h(_s1, _s0); \ 493 \ 494 _out0 = __lsx_vpickev_d(_t0, _t4); \ 495 _out2 = __lsx_vpickev_d(_t1, _t5); \ 496 _out4 = __lsx_vpickev_d(_t2, _t6); \ 497 _out6 = __lsx_vpickev_d(_t3, _t7); \ 498 _out1 = __lsx_vpickod_d(_t0, _t4); \ 499 _out3 = __lsx_vpickod_d(_t1, _t5); \ 500 _out5 = __lsx_vpickod_d(_t2, _t6); \ 501 _out7 = __lsx_vpickod_d(_t3, _t7); \ 502 } 503 504/* 505 * ============================================================================= 506 * Description : Transpose input 8x4 byte block into 4x8 507 * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) 508 * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) 509 * Return Type - as per RTYPE 510 * Details : The rows of the matrix become columns, and the columns become 511 * rows. 512 * Example : LSX_TRANSPOSE8x4_B 513 * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 514 * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 515 * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 516 * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 517 * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 518 * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 519 * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 520 * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 521 * 522 * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 523 * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 524 * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 525 * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 526 * ============================================================================= 527 */ 528#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 529 _out0, _out1, _out2, _out3) \ 530 { \ 531 __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 532 \ 533 _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ 534 _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ 535 _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ 536 _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ 537 _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ 538 \ 539 _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ 540 _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ 541 _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ 542 \ 543 _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ 544 _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ 545 _out1 = __lsx_vilvh_d(_out2, _out0); \ 546 _out3 = __lsx_vilvh_d(_out0, _out2); \ 547 } 548 549/* 550 * ============================================================================= 551 * Description : Transpose 16x8 block with byte elements in vectors 552 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 553 * in9, in10, in11, in12, in13, in14, in15 554 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 555 * Details : 556 * Example : 557 * 000,001,002,003,004,005,006,007 558 * 008,009,010,011,012,013,014,015 559 * 016,017,018,019,020,021,022,023 560 * 024,025,026,027,028,029,030,031 561 * 032,033,034,035,036,037,038,039 562 * 040,041,042,043,044,045,046,047 000,008,...,112,120 563 * 048,049,050,051,052,053,054,055 001,009,...,113,121 564 * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 565 * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 566 * 072,073,074,075,076,077,078,079 004,012,...,116,124 567 * 080,081,082,083,084,085,086,087 005,013,...,117,125 568 * 088,089,090,091,092,093,094,095 006,014,...,118,126 569 * 096,097,098,099,100,101,102,103 007,015,...,119,127 570 * 104,105,106,107,108,109,110,111 571 * 112,113,114,115,116,117,118,119 572 * 120,121,122,123,124,125,126,127 573 * ============================================================================= 574 */ 575#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 576 _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ 577 _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ 578 _out6, _out7) \ 579 { \ 580 __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ 581 __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 582 DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ 583 _tmp0, _tmp1, _tmp2, _tmp3); \ 584 DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ 585 _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ 586 DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ 587 DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ 588 DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ 589 DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ 590 DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ 591 DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ 592 DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ 593 DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ 594 DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ 595 DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ 596 DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ 597 DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ 598 } 599 600/* 601 * ============================================================================= 602 * Description : Butterfly of 4 input vectors 603 * Arguments : Inputs - in0, in1, in2, in3 604 * Outputs - out0, out1, out2, out3 605 * Details : Butterfly operation 606 * Example : 607 * out0 = in0 + in3; 608 * out1 = in1 + in2; 609 * out2 = in1 - in2; 610 * out3 = in0 - in3; 611 * ============================================================================= 612 */ 613#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 614 { \ 615 _out0 = __lsx_vadd_b(_in0, _in3); \ 616 _out1 = __lsx_vadd_b(_in1, _in2); \ 617 _out2 = __lsx_vsub_b(_in1, _in2); \ 618 _out3 = __lsx_vsub_b(_in0, _in3); \ 619 } 620#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 621 { \ 622 _out0 = __lsx_vadd_h(_in0, _in3); \ 623 _out1 = __lsx_vadd_h(_in1, _in2); \ 624 _out2 = __lsx_vsub_h(_in1, _in2); \ 625 _out3 = __lsx_vsub_h(_in0, _in3); \ 626 } 627#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 628 { \ 629 _out0 = __lsx_vadd_w(_in0, _in3); \ 630 _out1 = __lsx_vadd_w(_in1, _in2); \ 631 _out2 = __lsx_vsub_w(_in1, _in2); \ 632 _out3 = __lsx_vsub_w(_in0, _in3); \ 633 } 634#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 635 { \ 636 _out0 = __lsx_vadd_d(_in0, _in3); \ 637 _out1 = __lsx_vadd_d(_in1, _in2); \ 638 _out2 = __lsx_vsub_d(_in1, _in2); \ 639 _out3 = __lsx_vsub_d(_in0, _in3); \ 640 } 641 642/* 643 * ============================================================================= 644 * Description : Butterfly of 8 input vectors 645 * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ 646 * Outputs - _out0, _out1, _out2, _out3, ~ 647 * Details : Butterfly operation 648 * Example : 649 * _out0 = _in0 + _in7; 650 * _out1 = _in1 + _in6; 651 * _out2 = _in2 + _in5; 652 * _out3 = _in3 + _in4; 653 * _out4 = _in3 - _in4; 654 * _out5 = _in2 - _in5; 655 * _out6 = _in1 - _in6; 656 * _out7 = _in0 - _in7; 657 * ============================================================================= 658 */ 659#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 660 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 661 _out7) \ 662 { \ 663 _out0 = __lsx_vadd_b(_in0, _in7); \ 664 _out1 = __lsx_vadd_b(_in1, _in6); \ 665 _out2 = __lsx_vadd_b(_in2, _in5); \ 666 _out3 = __lsx_vadd_b(_in3, _in4); \ 667 _out4 = __lsx_vsub_b(_in3, _in4); \ 668 _out5 = __lsx_vsub_b(_in2, _in5); \ 669 _out6 = __lsx_vsub_b(_in1, _in6); \ 670 _out7 = __lsx_vsub_b(_in0, _in7); \ 671 } 672 673#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 674 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 675 _out7) \ 676 { \ 677 _out0 = __lsx_vadd_h(_in0, _in7); \ 678 _out1 = __lsx_vadd_h(_in1, _in6); \ 679 _out2 = __lsx_vadd_h(_in2, _in5); \ 680 _out3 = __lsx_vadd_h(_in3, _in4); \ 681 _out4 = __lsx_vsub_h(_in3, _in4); \ 682 _out5 = __lsx_vsub_h(_in2, _in5); \ 683 _out6 = __lsx_vsub_h(_in1, _in6); \ 684 _out7 = __lsx_vsub_h(_in0, _in7); \ 685 } 686 687#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 688 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 689 _out7) \ 690 { \ 691 _out0 = __lsx_vadd_w(_in0, _in7); \ 692 _out1 = __lsx_vadd_w(_in1, _in6); \ 693 _out2 = __lsx_vadd_w(_in2, _in5); \ 694 _out3 = __lsx_vadd_w(_in3, _in4); \ 695 _out4 = __lsx_vsub_w(_in3, _in4); \ 696 _out5 = __lsx_vsub_w(_in2, _in5); \ 697 _out6 = __lsx_vsub_w(_in1, _in6); \ 698 _out7 = __lsx_vsub_w(_in0, _in7); \ 699 } 700 701#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 702 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 703 _out7) \ 704 { \ 705 _out0 = __lsx_vadd_d(_in0, _in7); \ 706 _out1 = __lsx_vadd_d(_in1, _in6); \ 707 _out2 = __lsx_vadd_d(_in2, _in5); \ 708 _out3 = __lsx_vadd_d(_in3, _in4); \ 709 _out4 = __lsx_vsub_d(_in3, _in4); \ 710 _out5 = __lsx_vsub_d(_in2, _in5); \ 711 _out6 = __lsx_vsub_d(_in1, _in6); \ 712 _out7 = __lsx_vsub_d(_in0, _in7); \ 713 } 714 715#endif // LSX 716 717#ifdef __loongarch_asx 718#include <lasxintrin.h> 719/* 720 * ============================================================================= 721 * Description : Dot product of byte vector elements 722 * Arguments : Inputs - in_h, in_l 723 * Output - out 724 * Return Type - signed halfword 725 * Details : Unsigned byte elements from in_h are multiplied with 726 * unsigned byte elements from in_l producing a result 727 * twice the size of input i.e. signed halfword. 728 * Then this multiplied results of adjacent odd-even elements 729 * are added to the out vector 730 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 731 * ============================================================================= 732 */ 733static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { 734 __m256i out; 735 736 out = __lasx_xvmulwev_h_bu(in_h, in_l); 737 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); 738 return out; 739} 740 741/* 742 * ============================================================================= 743 * Description : Dot product of byte vector elements 744 * Arguments : Inputs - in_h, in_l 745 * Output - out 746 * Return Type - signed halfword 747 * Details : Signed byte elements from in_h are multiplied with 748 * signed byte elements from in_l producing a result 749 * twice the size of input i.e. signed halfword. 750 * Then this multiplication results of adjacent odd-even elements 751 * are added to the out vector 752 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 753 * ============================================================================= 754 */ 755static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { 756 __m256i out; 757 758 out = __lasx_xvmulwev_h_b(in_h, in_l); 759 out = __lasx_xvmaddwod_h_b(out, in_h, in_l); 760 return out; 761} 762 763/* 764 * ============================================================================= 765 * Description : Dot product of halfword vector elements 766 * Arguments : Inputs - in_h, in_l 767 * Output - out 768 * Return Type - signed word 769 * Details : Signed halfword elements from in_h are multiplied with 770 * signed halfword elements from in_l producing a result 771 * twice the size of input i.e. signed word. 772 * Then this multiplied results of adjacent odd-even elements 773 * are added to the out vector. 774 * Example : out = __lasx_xvdp2_w_h(in_h, in_l) 775 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 776 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 777 * out : 22,38,38,22, 22,38,38,22 778 * ============================================================================= 779 */ 780static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { 781 __m256i out; 782 783 out = __lasx_xvmulwev_w_h(in_h, in_l); 784 out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 785 return out; 786} 787 788/* 789 * ============================================================================= 790 * Description : Dot product of word vector elements 791 * Arguments : Inputs - in_h, in_l 792 * Output - out 793 * Return Type - signed double 794 * Details : Signed word elements from in_h are multiplied with 795 * signed word elements from in_l producing a result 796 * twice the size of input i.e. signed double-word. 797 * Then this multiplied results of adjacent odd-even elements 798 * are added to the out vector. 799 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 800 * ============================================================================= 801 */ 802static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { 803 __m256i out; 804 805 out = __lasx_xvmulwev_d_w(in_h, in_l); 806 out = __lasx_xvmaddwod_d_w(out, in_h, in_l); 807 return out; 808} 809 810/* 811 * ============================================================================= 812 * Description : Dot product of halfword vector elements 813 * Arguments : Inputs - in_h, in_l 814 * Output - out 815 * Return Type - signed word 816 * Details : Unsigned halfword elements from in_h are multiplied with 817 * signed halfword elements from in_l producing a result 818 * twice the size of input i.e. unsigned word. 819 * Multiplication result of adjacent odd-even elements 820 * are added to the out vector 821 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 822 * ============================================================================= 823 */ 824static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { 825 __m256i out; 826 827 out = __lasx_xvmulwev_w_hu_h(in_h, in_l); 828 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); 829 return out; 830} 831 832/* 833 * ============================================================================= 834 * Description : Dot product & addition of byte vector elements 835 * Arguments : Inputs - in_h, in_l 836 * Output - out 837 * Return Type - halfword 838 * Details : Signed byte elements from in_h are multiplied with 839 * signed byte elements from in_l producing a result 840 * twice the size of input i.e. signed halfword. 841 * Then this multiplied results of adjacent odd-even elements 842 * are added to the in_c vector. 843 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 844 * ============================================================================= 845 */ 846static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, 847 __m256i in_l) { 848 __m256i out; 849 850 out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); 851 out = __lasx_xvmaddwod_h_b(out, in_h, in_l); 852 return out; 853} 854 855/* 856 * ============================================================================= 857 * Description : Dot product & addition of byte vector elements 858 * Arguments : Inputs - in_h, in_l 859 * Output - out 860 * Return Type - halfword 861 * Details : Unsigned byte elements from in_h are multiplied with 862 * unsigned byte elements from in_l producing a result 863 * twice the size of input i.e. signed halfword. 864 * Then this multiplied results of adjacent odd-even elements 865 * are added to the in_c vector. 866 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 867 * ============================================================================= 868 */ 869static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, 870 __m256i in_l) { 871 __m256i out; 872 873 out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); 874 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); 875 return out; 876} 877 878/* 879 * ============================================================================= 880 * Description : Dot product & addition of byte vector elements 881 * Arguments : Inputs - in_h, in_l 882 * Output - out 883 * Return Type - halfword 884 * Details : Unsigned byte elements from in_h are multiplied with 885 * signed byte elements from in_l producing a result 886 * twice the size of input i.e. signed halfword. 887 * Then this multiplied results of adjacent odd-even elements 888 * are added to the in_c vector. 889 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 890 * ============================================================================= 891 */ 892static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, 893 __m256i in_l) { 894 __m256i out; 895 896 out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); 897 out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); 898 return out; 899} 900 901/* 902 * ============================================================================= 903 * Description : Dot product of halfword vector elements 904 * Arguments : Inputs - in_c, in_h, in_l 905 * Output - out 906 * Return Type - per RTYPE 907 * Details : Signed halfword elements from in_h are multiplied with 908 * signed halfword elements from in_l producing a result 909 * twice the size of input i.e. signed word. 910 * Multiplication result of adjacent odd-even elements 911 * are added to the in_c vector. 912 * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 913 * in_c : 1,2,3,4, 1,2,3,4 914 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, 915 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, 916 * out : 23,40,41,26, 23,40,41,26 917 * ============================================================================= 918 */ 919static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, 920 __m256i in_l) { 921 __m256i out; 922 923 out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); 924 out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 925 return out; 926} 927 928/* 929 * ============================================================================= 930 * Description : Dot product of halfword vector elements 931 * Arguments : Inputs - in_c, in_h, in_l 932 * Output - out 933 * Return Type - signed word 934 * Details : Unsigned halfword elements from in_h are multiplied with 935 * unsigned halfword elements from in_l producing a result 936 * twice the size of input i.e. signed word. 937 * Multiplication result of adjacent odd-even elements 938 * are added to the in_c vector. 939 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 940 * ============================================================================= 941 */ 942static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, 943 __m256i in_l) { 944 __m256i out; 945 946 out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); 947 out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); 948 return out; 949} 950 951/* 952 * ============================================================================= 953 * Description : Dot product of halfword vector elements 954 * Arguments : Inputs - in_c, in_h, in_l 955 * Output - out 956 * Return Type - signed word 957 * Details : Unsigned halfword elements from in_h are multiplied with 958 * signed halfword elements from in_l producing a result 959 * twice the size of input i.e. signed word. 960 * Multiplication result of adjacent odd-even elements 961 * are added to the in_c vector 962 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 963 * ============================================================================= 964 */ 965static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, 966 __m256i in_l) { 967 __m256i out; 968 969 out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); 970 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); 971 return out; 972} 973 974/* 975 * ============================================================================= 976 * Description : Vector Unsigned Dot Product and Subtract 977 * Arguments : Inputs - in_c, in_h, in_l 978 * Output - out 979 * Return Type - signed halfword 980 * Details : Unsigned byte elements from in_h are multiplied with 981 * unsigned byte elements from in_l producing a result 982 * twice the size of input i.e. signed halfword. 983 * Multiplication result of adjacent odd-even elements 984 * are added together and subtracted from double width elements 985 * in_c vector. 986 * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) 987 * ============================================================================= 988 */ 989static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, 990 __m256i in_l) { 991 __m256i out; 992 993 out = __lasx_xvmulwev_h_bu(in_h, in_l); 994 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); 995 out = __lasx_xvsub_h(in_c, out); 996 return out; 997} 998 999/* 1000 * ============================================================================= 1001 * Description : Vector Signed Dot Product and Subtract 1002 * Arguments : Inputs - in_c, in_h, in_l 1003 * Output - out 1004 * Return Type - signed word 1005 * Details : Signed halfword elements from in_h are multiplied with 1006 * Signed halfword elements from in_l producing a result 1007 * twice the size of input i.e. signed word. 1008 * Multiplication result of adjacent odd-even elements 1009 * are added together and subtracted from double width elements 1010 * in_c vector. 1011 * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) 1012 * in_c : 0,0,0,0, 0,0,0,0 1013 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 1014 * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 1015 * out : -7,-3,0,0, 0,-1,0,-1 1016 * ============================================================================= 1017 */ 1018static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, 1019 __m256i in_l) { 1020 __m256i out; 1021 1022 out = __lasx_xvmulwev_w_h(in_h, in_l); 1023 out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 1024 out = __lasx_xvsub_w(in_c, out); 1025 return out; 1026} 1027 1028/* 1029 * ============================================================================= 1030 * Description : Dot product of halfword vector elements 1031 * Arguments : Inputs - in_h, in_l 1032 * Output - out 1033 * Return Type - signed word 1034 * Details : Signed halfword elements from in_h are multiplied with 1035 * signed halfword elements from in_l producing a result 1036 * four times the size of input i.e. signed doubleword. 1037 * Then this multiplication results of four adjacent elements 1038 * are added together and stored to the out vector. 1039 * Example : out = __lasx_xvdp4_d_h(in_h, in_l) 1040 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 1041 * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 1042 * out : -2,0,1,1 1043 * ============================================================================= 1044 */ 1045static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { 1046 __m256i out; 1047 1048 out = __lasx_xvmulwev_w_h(in_h, in_l); 1049 out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 1050 out = __lasx_xvhaddw_d_w(out, out); 1051 return out; 1052} 1053 1054/* 1055 * ============================================================================= 1056 * Description : The high half of the vector elements are expanded and 1057 * added after being doubled. 1058 * Arguments : Inputs - in_h, in_l 1059 * Output - out 1060 * Details : The in_h vector and the in_l vector are added after the 1061 * higher half of the two-fold sign extension (signed byte 1062 * to signed halfword) and stored to the out vector. 1063 * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l) 1064 * ============================================================================= 1065 */ 1066static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) { 1067 __m256i out; 1068 1069 out = __lasx_xvilvh_b(in_h, in_l); 1070 out = __lasx_xvhaddw_h_b(out, out); 1071 return out; 1072} 1073 1074/* 1075 * ============================================================================= 1076 * Description : The high half of the vector elements are expanded and 1077 * added after being doubled. 1078 * Arguments : Inputs - in_h, in_l 1079 * Output - out 1080 * Details : The in_h vector and the in_l vector are added after the 1081 * higher half of the two-fold sign extension (signed halfword 1082 * to signed word) and stored to the out vector. 1083 * Example : out = __lasx_xvaddwh_w_h(in_h, in_l) 1084 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1085 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 1086 * out : 1,0,0,-1, 1,0,0, 2 1087 * ============================================================================= 1088 */ 1089static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) { 1090 __m256i out; 1091 1092 out = __lasx_xvilvh_h(in_h, in_l); 1093 out = __lasx_xvhaddw_w_h(out, out); 1094 return out; 1095} 1096 1097/* 1098 * ============================================================================= 1099 * Description : The low half of the vector elements are expanded and 1100 * added after being doubled. 1101 * Arguments : Inputs - in_h, in_l 1102 * Output - out 1103 * Details : The in_h vector and the in_l vector are added after the 1104 * lower half of the two-fold sign extension (signed byte 1105 * to signed halfword) and stored to the out vector. 1106 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) 1107 * ============================================================================= 1108 */ 1109static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) { 1110 __m256i out; 1111 1112 out = __lasx_xvilvl_b(in_h, in_l); 1113 out = __lasx_xvhaddw_h_b(out, out); 1114 return out; 1115} 1116 1117/* 1118 * ============================================================================= 1119 * Description : The low half of the vector elements are expanded and 1120 * added after being doubled. 1121 * Arguments : Inputs - in_h, in_l 1122 * Output - out 1123 * Details : The in_h vector and the in_l vector are added after the 1124 * lower half of the two-fold sign extension (signed halfword 1125 * to signed word) and stored to the out vector. 1126 * Example : out = __lasx_xvaddwl_w_h(in_h, in_l) 1127 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1128 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 1129 * out : 5,-1,4,2, 1,0,2,-1 1130 * ============================================================================= 1131 */ 1132static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) { 1133 __m256i out; 1134 1135 out = __lasx_xvilvl_h(in_h, in_l); 1136 out = __lasx_xvhaddw_w_h(out, out); 1137 return out; 1138} 1139 1140/* 1141 * ============================================================================= 1142 * Description : The low half of the vector elements are expanded and 1143 * added after being doubled. 1144 * Arguments : Inputs - in_h, in_l 1145 * Output - out 1146 * Details : The out vector and the out vector are added after the 1147 * lower half of the two-fold zero extension (unsigned byte 1148 * to unsigned halfword) and stored to the out vector. 1149 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) 1150 * ============================================================================= 1151 */ 1152static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { 1153 __m256i out; 1154 1155 out = __lasx_xvilvl_b(in_h, in_l); 1156 out = __lasx_xvhaddw_hu_bu(out, out); 1157 return out; 1158} 1159 1160/* 1161 * ============================================================================= 1162 * Description : The low half of the vector elements are expanded and 1163 * added after being doubled. 1164 * Arguments : Inputs - in_h, in_l 1165 * Output - out 1166 * Details : The in_l vector after double zero extension (unsigned byte to 1167 * signed halfword),added to the in_h vector. 1168 * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) 1169 * ============================================================================= 1170 */ 1171static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { 1172 __m256i out; 1173 1174 out = __lasx_xvsllwil_hu_bu(in_l, 0); 1175 out = __lasx_xvadd_h(in_h, out); 1176 return out; 1177} 1178 1179/* 1180 * ============================================================================= 1181 * Description : The low half of the vector elements are expanded and 1182 * added after being doubled. 1183 * Arguments : Inputs - in_h, in_l 1184 * Output - out 1185 * Details : The in_l vector after double sign extension (signed halfword to 1186 * signed word), added to the in_h vector. 1187 * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) 1188 * in_h : 0, 1,0,0, -1,0,0,1, 1189 * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, 1190 * out : 2, 0,1,2, -1,0,1,1, 1191 * ============================================================================= 1192 */ 1193static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { 1194 __m256i out; 1195 1196 out = __lasx_xvsllwil_w_h(in_l, 0); 1197 out = __lasx_xvadd_w(in_h, out); 1198 return out; 1199} 1200 1201/* 1202 * ============================================================================= 1203 * Description : Multiplication and addition calculation after expansion 1204 * of the lower half of the vector. 1205 * Arguments : Inputs - in_c, in_h, in_l 1206 * Output - out 1207 * Details : The in_h vector and the in_l vector are multiplied after 1208 * the lower half of the two-fold sign extension (signed halfword 1209 * to signed word), and the result is added to the vector in_c, 1210 * then stored to the out vector. 1211 * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) 1212 * in_c : 1,2,3,4, 5,6,7,8 1213 * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 1214 * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, 1215 * -200,-300,-400,-500, -2000,-3000,-4000,-5000 1216 * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 1217 * ============================================================================= 1218 */ 1219static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, 1220 __m256i in_l) { 1221 __m256i tmp0, tmp1, out; 1222 1223 tmp0 = __lasx_xvsllwil_w_h(in_h, 0); 1224 tmp1 = __lasx_xvsllwil_w_h(in_l, 0); 1225 tmp0 = __lasx_xvmul_w(tmp0, tmp1); 1226 out = __lasx_xvadd_w(tmp0, in_c); 1227 return out; 1228} 1229 1230/* 1231 * ============================================================================= 1232 * Description : Multiplication and addition calculation after expansion 1233 * of the higher half of the vector. 1234 * Arguments : Inputs - in_c, in_h, in_l 1235 * Output - out 1236 * Details : The in_h vector and the in_l vector are multiplied after 1237 * the higher half of the two-fold sign extension (signed 1238 * halfword to signed word), and the result is added to 1239 * the vector in_c, then stored to the out vector. 1240 * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) 1241 * ============================================================================= 1242 */ 1243static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, 1244 __m256i in_l) { 1245 __m256i tmp0, tmp1, out; 1246 1247 tmp0 = __lasx_xvilvh_h(in_h, in_h); 1248 tmp1 = __lasx_xvilvh_h(in_l, in_l); 1249 tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); 1250 out = __lasx_xvadd_w(tmp0, in_c); 1251 return out; 1252} 1253 1254/* 1255 * ============================================================================= 1256 * Description : Multiplication calculation after expansion of the lower 1257 * half of the vector. 1258 * Arguments : Inputs - in_h, in_l 1259 * Output - out 1260 * Details : The in_h vector and the in_l vector are multiplied after 1261 * the lower half of the two-fold sign extension (signed 1262 * halfword to signed word), then stored to the out vector. 1263 * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) 1264 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1265 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 1266 * out : 6,1,3,0, 0,0,1,0 1267 * ============================================================================= 1268 */ 1269static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { 1270 __m256i tmp0, tmp1, out; 1271 1272 tmp0 = __lasx_xvsllwil_w_h(in_h, 0); 1273 tmp1 = __lasx_xvsllwil_w_h(in_l, 0); 1274 out = __lasx_xvmul_w(tmp0, tmp1); 1275 return out; 1276} 1277 1278/* 1279 * ============================================================================= 1280 * Description : Multiplication calculation after expansion of the lower 1281 * half of the vector. 1282 * Arguments : Inputs - in_h, in_l 1283 * Output - out 1284 * Details : The in_h vector and the in_l vector are multiplied after 1285 * the lower half of the two-fold sign extension (signed 1286 * halfword to signed word), then stored to the out vector. 1287 * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) 1288 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1289 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 1290 * out : 0,0,0,0, 0,0,0,1 1291 * ============================================================================= 1292 */ 1293static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { 1294 __m256i tmp0, tmp1, out; 1295 1296 tmp0 = __lasx_xvilvh_h(in_h, in_h); 1297 tmp1 = __lasx_xvilvh_h(in_l, in_l); 1298 out = __lasx_xvmulwev_w_h(tmp0, tmp1); 1299 return out; 1300} 1301 1302/* 1303 * ============================================================================= 1304 * Description : The low half of the vector elements are added to the high half 1305 * after being doubled, then saturated. 1306 * Arguments : Inputs - in_h, in_l 1307 * Output - out 1308 * Details : The in_h vector adds the in_l vector after the lower half of 1309 * the two-fold zero extension (unsigned byte to unsigned 1310 * halfword) and then saturated. The results are stored to the out 1311 * vector. 1312 * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) 1313 * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 1314 * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 1315 * 0,0,0,1 1316 * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, 1317 * ============================================================================= 1318 */ 1319static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { 1320 __m256i tmp1, out; 1321 __m256i zero = { 0 }; 1322 1323 tmp1 = __lasx_xvilvl_b(zero, in_l); 1324 out = __lasx_xvsadd_hu(in_h, tmp1); 1325 return out; 1326} 1327 1328/* 1329 * ============================================================================= 1330 * Description : Clip all halfword elements of input vector between min & max 1331 * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) 1332 * Arguments : Inputs - in (input vector) 1333 * - min (min threshold) 1334 * - max (max threshold) 1335 * Outputs - in (output vector with clipped elements) 1336 * Return Type - signed halfword 1337 * Example : out = __lasx_xvclip_h(in, min, max) 1338 * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 1339 * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 1340 * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 1341 * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 1342 * ============================================================================= 1343 */ 1344static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { 1345 __m256i out; 1346 1347 out = __lasx_xvmax_h(min, in); 1348 out = __lasx_xvmin_h(max, out); 1349 return out; 1350} 1351 1352/* 1353 * ============================================================================= 1354 * Description : Clip all signed halfword elements of input vector 1355 * between 0 & 255 1356 * Arguments : Inputs - in (input vector) 1357 * Outputs - out (output vector with clipped elements) 1358 * Return Type - signed halfword 1359 * Example : See out = __lasx_xvclip255_w(in) 1360 * ============================================================================= 1361 */ 1362static inline __m256i __lasx_xvclip255_h(__m256i in) { 1363 __m256i out; 1364 1365 out = __lasx_xvmaxi_h(in, 0); 1366 out = __lasx_xvsat_hu(out, 7); 1367 return out; 1368} 1369 1370/* 1371 * ============================================================================= 1372 * Description : Clip all signed word elements of input vector 1373 * between 0 & 255 1374 * Arguments : Inputs - in (input vector) 1375 * Output - out (output vector with clipped elements) 1376 * Return Type - signed word 1377 * Example : out = __lasx_xvclip255_w(in) 1378 * in : -8,255,280,249, -8,255,280,249 1379 * out : 0,255,255,249, 0,255,255,249 1380 * ============================================================================= 1381 */ 1382static inline __m256i __lasx_xvclip255_w(__m256i in) { 1383 __m256i out; 1384 1385 out = __lasx_xvmaxi_w(in, 0); 1386 out = __lasx_xvsat_wu(out, 7); 1387 return out; 1388} 1389 1390/* 1391 * ============================================================================= 1392 * Description : Indexed halfword element values are replicated to all 1393 * elements in output vector. If 'idx < 8' use xvsplati_l_*, 1394 * if 'idx >= 8' use xvsplati_h_*. 1395 * Arguments : Inputs - in, idx 1396 * Output - out 1397 * Details : Idx element value from in vector is replicated to all 1398 * elements in out vector. 1399 * Valid index range for halfword operation is 0-7 1400 * Example : out = __lasx_xvsplati_l_h(in, idx) 1401 * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 1402 * idx : 0x02 1403 * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 1404 * ============================================================================= 1405 */ 1406static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { 1407 __m256i out; 1408 1409 out = __lasx_xvpermi_q(in, in, 0x02); 1410 out = __lasx_xvreplve_h(out, idx); 1411 return out; 1412} 1413 1414/* 1415 * ============================================================================= 1416 * Description : Indexed halfword element values are replicated to all 1417 * elements in output vector. If 'idx < 8' use xvsplati_l_*, 1418 * if 'idx >= 8' use xvsplati_h_*. 1419 * Arguments : Inputs - in, idx 1420 * Output - out 1421 * Details : Idx element value from in vector is replicated to all 1422 * elements in out vector. 1423 * Valid index range for halfword operation is 0-7 1424 * Example : out = __lasx_xvsplati_h_h(in, idx) 1425 * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 1426 * idx : 0x09 1427 * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 1428 * ============================================================================= 1429 */ 1430static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { 1431 __m256i out; 1432 1433 out = __lasx_xvpermi_q(in, in, 0x13); 1434 out = __lasx_xvreplve_h(out, idx); 1435 return out; 1436} 1437 1438/* 1439 * ============================================================================= 1440 * Description : Transpose 4x4 block with double-word elements in vectors 1441 * Arguments : Inputs - _in0, _in1, _in2, _in3 1442 * Outputs - _out0, _out1, _out2, _out3 1443 * Example : LASX_TRANSPOSE4x4_D 1444 * _in0 : 1,2,3,4 1445 * _in1 : 1,2,3,4 1446 * _in2 : 1,2,3,4 1447 * _in3 : 1,2,3,4 1448 * 1449 * _out0 : 1,1,1,1 1450 * _out1 : 2,2,2,2 1451 * _out2 : 3,3,3,3 1452 * _out3 : 4,4,4,4 1453 * ============================================================================= 1454 */ 1455#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ 1456 _out3) \ 1457 { \ 1458 __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ 1459 _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ 1460 _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ 1461 _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ 1462 _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ 1463 _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ 1464 _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ 1465 _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ 1466 _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ 1467 } 1468 1469/* 1470 * ============================================================================= 1471 * Description : Transpose 8x8 block with word elements in vectors 1472 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 1473 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1474 * _out7 1475 * Example : LASX_TRANSPOSE8x8_W 1476 * _in0 : 1,2,3,4,5,6,7,8 1477 * _in1 : 2,2,3,4,5,6,7,8 1478 * _in2 : 3,2,3,4,5,6,7,8 1479 * _in3 : 4,2,3,4,5,6,7,8 1480 * _in4 : 5,2,3,4,5,6,7,8 1481 * _in5 : 6,2,3,4,5,6,7,8 1482 * _in6 : 7,2,3,4,5,6,7,8 1483 * _in7 : 8,2,3,4,5,6,7,8 1484 * 1485 * _out0 : 1,2,3,4,5,6,7,8 1486 * _out1 : 2,2,2,2,2,2,2,2 1487 * _out2 : 3,3,3,3,3,3,3,3 1488 * _out3 : 4,4,4,4,4,4,4,4 1489 * _out4 : 5,5,5,5,5,5,5,5 1490 * _out5 : 6,6,6,6,6,6,6,6 1491 * _out6 : 7,7,7,7,7,7,7,7 1492 * _out7 : 8,8,8,8,8,8,8,8 1493 * ============================================================================= 1494 */ 1495#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1496 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1497 _out7) \ 1498 { \ 1499 __m256i _s0_m, _s1_m; \ 1500 __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1501 __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1502 \ 1503 _s0_m = __lasx_xvilvl_w(_in2, _in0); \ 1504 _s1_m = __lasx_xvilvl_w(_in3, _in1); \ 1505 _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1506 _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1507 _s0_m = __lasx_xvilvh_w(_in2, _in0); \ 1508 _s1_m = __lasx_xvilvh_w(_in3, _in1); \ 1509 _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1510 _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1511 _s0_m = __lasx_xvilvl_w(_in6, _in4); \ 1512 _s1_m = __lasx_xvilvl_w(_in7, _in5); \ 1513 _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1514 _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1515 _s0_m = __lasx_xvilvh_w(_in6, _in4); \ 1516 _s1_m = __lasx_xvilvh_w(_in7, _in5); \ 1517 _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1518 _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1519 _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ 1520 _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ 1521 _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ 1522 _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \ 1523 _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ 1524 _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ 1525 _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ 1526 _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ 1527 } 1528 1529/* 1530 * ============================================================================= 1531 * Description : Transpose input 16x8 byte block 1532 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, 1533 * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 1534 * (input 16x8 byte block) 1535 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1536 * _out7 (output 8x16 byte block) 1537 * Details : The rows of the matrix become columns, and the columns become 1538 * rows. 1539 * Example : See LASX_TRANSPOSE16x8_H 1540 * ============================================================================= 1541 */ 1542#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1543 _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ 1544 _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ 1545 _out6, _out7) \ 1546 { \ 1547 __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1548 __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1549 \ 1550 _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ 1551 _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ 1552 _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ 1553 _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ 1554 _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ 1555 _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ 1556 _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ 1557 _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ 1558 _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ 1559 _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ 1560 _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ 1561 _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ 1562 _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ 1563 _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ 1564 _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ 1565 _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ 1566 _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ 1567 _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ 1568 _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ 1569 _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ 1570 _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ 1571 _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ 1572 _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ 1573 _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ 1574 _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ 1575 _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ 1576 _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ 1577 _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ 1578 _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ 1579 _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ 1580 _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ 1581 _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ 1582 } 1583 1584/* 1585 * ============================================================================= 1586 * Description : Transpose input 16x8 byte block 1587 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, 1588 * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 1589 * (input 16x8 byte block) 1590 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1591 * _out7 (output 8x16 byte block) 1592 * Details : The rows of the matrix become columns, and the columns become 1593 * rows. 1594 * Example : LASX_TRANSPOSE16x8_H 1595 * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1596 * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1597 * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1598 * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1599 * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1600 * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1601 * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1602 * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1603 * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1604 * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1605 * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1606 * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1607 * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1608 * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1609 * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1610 * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1611 * 1612 * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 1613 * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 1614 * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 1615 * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 1616 * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 1617 * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 1618 * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 1619 * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 1620 * ============================================================================= 1621 */ 1622#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1623 _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ 1624 _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ 1625 _out6, _out7) \ 1626 { \ 1627 __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1628 __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1629 __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 1630 \ 1631 _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ 1632 _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ 1633 _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ 1634 _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ 1635 _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ 1636 _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ 1637 _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ 1638 _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ 1639 _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ 1640 _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ 1641 _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ 1642 _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ 1643 _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ 1644 _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ 1645 _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ 1646 _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ 1647 _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ 1648 _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ 1649 _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ 1650 _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ 1651 _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ 1652 _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ 1653 _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ 1654 _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ 1655 _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ 1656 _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ 1657 _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ 1658 _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ 1659 \ 1660 _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ 1661 _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ 1662 _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ 1663 _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ 1664 _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ 1665 _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ 1666 _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ 1667 _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ 1668 _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ 1669 _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ 1670 _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ 1671 _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ 1672 _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ 1673 _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ 1674 _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ 1675 _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ 1676 _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ 1677 _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ 1678 _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ 1679 _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ 1680 _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ 1681 _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ 1682 _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ 1683 _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ 1684 _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ 1685 _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ 1686 _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ 1687 _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ 1688 } 1689 1690/* 1691 * ============================================================================= 1692 * Description : Transpose 4x4 block with halfword elements in vectors 1693 * Arguments : Inputs - _in0, _in1, _in2, _in3 1694 * Outputs - _out0, _out1, _out2, _out3 1695 * Return Type - signed halfword 1696 * Details : The rows of the matrix become columns, and the columns become 1697 * rows. 1698 * Example : See LASX_TRANSPOSE8x8_H 1699 * ============================================================================= 1700 */ 1701#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ 1702 _out3) \ 1703 { \ 1704 __m256i _s0_m, _s1_m; \ 1705 \ 1706 _s0_m = __lasx_xvilvl_h(_in1, _in0); \ 1707 _s1_m = __lasx_xvilvl_h(_in3, _in2); \ 1708 _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1709 _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1710 _out1 = __lasx_xvilvh_d(_out0, _out0); \ 1711 _out3 = __lasx_xvilvh_d(_out2, _out2); \ 1712 } 1713 1714/* 1715 * ============================================================================= 1716 * Description : Transpose input 8x8 byte block 1717 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 1718 * (input 8x8 byte block) 1719 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1720 * _out7 (output 8x8 byte block) 1721 * Example : See LASX_TRANSPOSE8x8_H 1722 * ============================================================================= 1723 */ 1724#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1725 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1726 _out7) \ 1727 { \ 1728 __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1729 __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1730 _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ 1731 _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ 1732 _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ 1733 _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ 1734 _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ 1735 _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ 1736 _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ 1737 _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ 1738 _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ 1739 _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ 1740 _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ 1741 _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ 1742 _out1 = __lasx_xvbsrl_v(_out0, 8); \ 1743 _out3 = __lasx_xvbsrl_v(_out2, 8); \ 1744 _out5 = __lasx_xvbsrl_v(_out4, 8); \ 1745 _out7 = __lasx_xvbsrl_v(_out6, 8); \ 1746 } 1747 1748/* 1749 * ============================================================================= 1750 * Description : Transpose 8x8 block with halfword elements in vectors. 1751 * Arguments : Inputs - _in0, _in1, ~ 1752 * Outputs - _out0, _out1, ~ 1753 * Details : The rows of the matrix become columns, and the columns become 1754 * rows. 1755 * Example : LASX_TRANSPOSE8x8_H 1756 * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1757 * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 1758 * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 1759 * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1760 * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 1761 * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1762 * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1763 * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 1764 * 1765 * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 1766 * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 1767 * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 1768 * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 1769 * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 1770 * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 1771 * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 1772 * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 1773 * ============================================================================= 1774 */ 1775#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1776 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1777 _out7) \ 1778 { \ 1779 __m256i _s0_m, _s1_m; \ 1780 __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1781 __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1782 \ 1783 _s0_m = __lasx_xvilvl_h(_in6, _in4); \ 1784 _s1_m = __lasx_xvilvl_h(_in7, _in5); \ 1785 _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1786 _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1787 _s0_m = __lasx_xvilvh_h(_in6, _in4); \ 1788 _s1_m = __lasx_xvilvh_h(_in7, _in5); \ 1789 _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1790 _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1791 \ 1792 _s0_m = __lasx_xvilvl_h(_in2, _in0); \ 1793 _s1_m = __lasx_xvilvl_h(_in3, _in1); \ 1794 _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1795 _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1796 _s0_m = __lasx_xvilvh_h(_in2, _in0); \ 1797 _s1_m = __lasx_xvilvh_h(_in3, _in1); \ 1798 _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1799 _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1800 \ 1801 _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ 1802 _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ 1803 _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ 1804 _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ 1805 _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ 1806 _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ 1807 _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ 1808 _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ 1809 } 1810 1811/* 1812 * ============================================================================= 1813 * Description : Butterfly of 4 input vectors 1814 * Arguments : Inputs - _in0, _in1, _in2, _in3 1815 * Outputs - _out0, _out1, _out2, _out3 1816 * Details : Butterfly operation 1817 * Example : LASX_BUTTERFLY_4 1818 * _out0 = _in0 + _in3; 1819 * _out1 = _in1 + _in2; 1820 * _out2 = _in1 - _in2; 1821 * _out3 = _in0 - _in3; 1822 * ============================================================================= 1823 */ 1824#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1825 { \ 1826 _out0 = __lasx_xvadd_b(_in0, _in3); \ 1827 _out1 = __lasx_xvadd_b(_in1, _in2); \ 1828 _out2 = __lasx_xvsub_b(_in1, _in2); \ 1829 _out3 = __lasx_xvsub_b(_in0, _in3); \ 1830 } 1831#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1832 { \ 1833 _out0 = __lasx_xvadd_h(_in0, _in3); \ 1834 _out1 = __lasx_xvadd_h(_in1, _in2); \ 1835 _out2 = __lasx_xvsub_h(_in1, _in2); \ 1836 _out3 = __lasx_xvsub_h(_in0, _in3); \ 1837 } 1838#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1839 { \ 1840 _out0 = __lasx_xvadd_w(_in0, _in3); \ 1841 _out1 = __lasx_xvadd_w(_in1, _in2); \ 1842 _out2 = __lasx_xvsub_w(_in1, _in2); \ 1843 _out3 = __lasx_xvsub_w(_in0, _in3); \ 1844 } 1845#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1846 { \ 1847 _out0 = __lasx_xvadd_d(_in0, _in3); \ 1848 _out1 = __lasx_xvadd_d(_in1, _in2); \ 1849 _out2 = __lasx_xvsub_d(_in1, _in2); \ 1850 _out3 = __lasx_xvsub_d(_in0, _in3); \ 1851 } 1852 1853/* 1854 * ============================================================================= 1855 * Description : Butterfly of 8 input vectors 1856 * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ 1857 * Outputs - _out0, _out1, _out2, _out3, ~ 1858 * Details : Butterfly operation 1859 * Example : LASX_BUTTERFLY_8 1860 * _out0 = _in0 + _in7; 1861 * _out1 = _in1 + _in6; 1862 * _out2 = _in2 + _in5; 1863 * _out3 = _in3 + _in4; 1864 * _out4 = _in3 - _in4; 1865 * _out5 = _in2 - _in5; 1866 * _out6 = _in1 - _in6; 1867 * _out7 = _in0 - _in7; 1868 * ============================================================================= 1869 */ 1870#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1871 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1872 _out7) \ 1873 { \ 1874 _out0 = __lasx_xvadd_b(_in0, _in7); \ 1875 _out1 = __lasx_xvadd_b(_in1, _in6); \ 1876 _out2 = __lasx_xvadd_b(_in2, _in5); \ 1877 _out3 = __lasx_xvadd_b(_in3, _in4); \ 1878 _out4 = __lasx_xvsub_b(_in3, _in4); \ 1879 _out5 = __lasx_xvsub_b(_in2, _in5); \ 1880 _out6 = __lasx_xvsub_b(_in1, _in6); \ 1881 _out7 = __lasx_xvsub_b(_in0, _in7); \ 1882 } 1883 1884#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1885 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1886 _out7) \ 1887 { \ 1888 _out0 = __lasx_xvadd_h(_in0, _in7); \ 1889 _out1 = __lasx_xvadd_h(_in1, _in6); \ 1890 _out2 = __lasx_xvadd_h(_in2, _in5); \ 1891 _out3 = __lasx_xvadd_h(_in3, _in4); \ 1892 _out4 = __lasx_xvsub_h(_in3, _in4); \ 1893 _out5 = __lasx_xvsub_h(_in2, _in5); \ 1894 _out6 = __lasx_xvsub_h(_in1, _in6); \ 1895 _out7 = __lasx_xvsub_h(_in0, _in7); \ 1896 } 1897 1898#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1899 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1900 _out7) \ 1901 { \ 1902 _out0 = __lasx_xvadd_w(_in0, _in7); \ 1903 _out1 = __lasx_xvadd_w(_in1, _in6); \ 1904 _out2 = __lasx_xvadd_w(_in2, _in5); \ 1905 _out3 = __lasx_xvadd_w(_in3, _in4); \ 1906 _out4 = __lasx_xvsub_w(_in3, _in4); \ 1907 _out5 = __lasx_xvsub_w(_in2, _in5); \ 1908 _out6 = __lasx_xvsub_w(_in1, _in6); \ 1909 _out7 = __lasx_xvsub_w(_in0, _in7); \ 1910 } 1911 1912#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1913 _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1914 _out7) \ 1915 { \ 1916 _out0 = __lasx_xvadd_d(_in0, _in7); \ 1917 _out1 = __lasx_xvadd_d(_in1, _in6); \ 1918 _out2 = __lasx_xvadd_d(_in2, _in5); \ 1919 _out3 = __lasx_xvadd_d(_in3, _in4); \ 1920 _out4 = __lasx_xvsub_d(_in3, _in4); \ 1921 _out5 = __lasx_xvsub_d(_in2, _in5); \ 1922 _out6 = __lasx_xvsub_d(_in1, _in6); \ 1923 _out7 = __lasx_xvsub_d(_in0, _in7); \ 1924 } 1925 1926#endif // LASX 1927 1928/* 1929 * ============================================================================= 1930 * Description : Print out elements in vector. 1931 * Arguments : Inputs - RTYPE, _element_num, _in0, _enter 1932 * Outputs - 1933 * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if 1934 * '_enter' is TRUE, prefix "\nVP:" will be added first. 1935 * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 1936 * VP:1,2,3,4, 1937 * ============================================================================= 1938 */ 1939#define VECT_PRINT(RTYPE, element_num, in0, enter) \ 1940 { \ 1941 RTYPE _tmp0 = (RTYPE)in0; \ 1942 int _i = 0; \ 1943 if (enter) printf("\nVP:"); \ 1944 for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ 1945 } 1946 1947#endif /* LOONGSON_INTRINSICS_H */ 1948#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */ 1949