1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * All rights reserved. 4cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 5cabdff1aSopenharmony_ci * Xiwei Gu <guxiwei-hf@loongson.cn> 6cabdff1aSopenharmony_ci * Lu Wang <wanglu@loongson.cn> 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * This file is part of FFmpeg. 9cabdff1aSopenharmony_ci * 10cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 11cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 12cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 13cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 14cabdff1aSopenharmony_ci * 15cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 16cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 17cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18cabdff1aSopenharmony_ci * Lesser General Public License for more details. 19cabdff1aSopenharmony_ci * 20cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 21cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 22cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23cabdff1aSopenharmony_ci * 24cabdff1aSopenharmony_ci */ 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci#ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H 27cabdff1aSopenharmony_ci#define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H 28cabdff1aSopenharmony_ci 29cabdff1aSopenharmony_ci/* 30cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 31cabdff1aSopenharmony_ci * All rights reserved. 32cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 33cabdff1aSopenharmony_ci * Xiwei Gu <guxiwei-hf@loongson.cn> 34cabdff1aSopenharmony_ci * Lu Wang <wanglu@loongson.cn> 35cabdff1aSopenharmony_ci * 36cabdff1aSopenharmony_ci * This file is a header file for loongarch builtin extension. 37cabdff1aSopenharmony_ci * 38cabdff1aSopenharmony_ci */ 39cabdff1aSopenharmony_ci 40cabdff1aSopenharmony_ci#ifndef LOONGSON_INTRINSICS_H 41cabdff1aSopenharmony_ci#define LOONGSON_INTRINSICS_H 42cabdff1aSopenharmony_ci 43cabdff1aSopenharmony_ci/** 44cabdff1aSopenharmony_ci * MAJOR version: Macro usage changes. 45cabdff1aSopenharmony_ci * MINOR version: Add new functions, or bug fixes. 46cabdff1aSopenharmony_ci * MICRO version: Comment changes or implementation changes. 47cabdff1aSopenharmony_ci */ 48cabdff1aSopenharmony_ci#define LSOM_VERSION_MAJOR 1 49cabdff1aSopenharmony_ci#define LSOM_VERSION_MINOR 1 50cabdff1aSopenharmony_ci#define LSOM_VERSION_MICRO 0 51cabdff1aSopenharmony_ci 52cabdff1aSopenharmony_ci#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ 53cabdff1aSopenharmony_ci { \ 54cabdff1aSopenharmony_ci _OUT0 = _INS(_IN0); \ 55cabdff1aSopenharmony_ci _OUT1 = _INS(_IN1); \ 56cabdff1aSopenharmony_ci } 57cabdff1aSopenharmony_ci 58cabdff1aSopenharmony_ci#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \ 59cabdff1aSopenharmony_ci { \ 60cabdff1aSopenharmony_ci _OUT0 = _INS(_IN0, _IN1); \ 61cabdff1aSopenharmony_ci _OUT1 = _INS(_IN2, _IN3); \ 62cabdff1aSopenharmony_ci } 63cabdff1aSopenharmony_ci 64cabdff1aSopenharmony_ci#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \ 65cabdff1aSopenharmony_ci { \ 66cabdff1aSopenharmony_ci _OUT0 = _INS(_IN0, _IN1, _IN2); \ 67cabdff1aSopenharmony_ci _OUT1 = _INS(_IN3, _IN4, _IN5); \ 68cabdff1aSopenharmony_ci } 69cabdff1aSopenharmony_ci 70cabdff1aSopenharmony_ci#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \ 71cabdff1aSopenharmony_ci { \ 72cabdff1aSopenharmony_ci DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \ 73cabdff1aSopenharmony_ci DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \ 74cabdff1aSopenharmony_ci } 75cabdff1aSopenharmony_ci 76cabdff1aSopenharmony_ci#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \ 77cabdff1aSopenharmony_ci _OUT1, _OUT2, _OUT3) \ 78cabdff1aSopenharmony_ci { \ 79cabdff1aSopenharmony_ci DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \ 80cabdff1aSopenharmony_ci DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \ 81cabdff1aSopenharmony_ci } 82cabdff1aSopenharmony_ci 83cabdff1aSopenharmony_ci#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \ 84cabdff1aSopenharmony_ci _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \ 85cabdff1aSopenharmony_ci { \ 86cabdff1aSopenharmony_ci DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \ 87cabdff1aSopenharmony_ci DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \ 88cabdff1aSopenharmony_ci } 89cabdff1aSopenharmony_ci 90cabdff1aSopenharmony_ci#ifdef __loongarch_sx 91cabdff1aSopenharmony_ci#include <lsxintrin.h> 92cabdff1aSopenharmony_ci/* 93cabdff1aSopenharmony_ci * ============================================================================= 94cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements 95cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 96cabdff1aSopenharmony_ci * Outputs - out 97cabdff1aSopenharmony_ci * Return Type - halfword 98cabdff1aSopenharmony_ci * Details : Signed byte elements from in_h are multiplied by 99cabdff1aSopenharmony_ci * signed byte elements from in_l, and then added adjacent to 100cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 101cabdff1aSopenharmony_ci * Then the results plus to signed half-word elements from in_c. 102cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) 103cabdff1aSopenharmony_ci * in_c : 1,2,3,4, 1,2,3,4 104cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 105cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 106cabdff1aSopenharmony_ci * out : 23,40,41,26, 23,40,41,26 107cabdff1aSopenharmony_ci * ============================================================================= 108cabdff1aSopenharmony_ci */ 109cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, 110cabdff1aSopenharmony_ci __m128i in_l) { 111cabdff1aSopenharmony_ci __m128i out; 112cabdff1aSopenharmony_ci 113cabdff1aSopenharmony_ci out = __lsx_vmaddwev_h_b(in_c, in_h, in_l); 114cabdff1aSopenharmony_ci out = __lsx_vmaddwod_h_b(out, in_h, in_l); 115cabdff1aSopenharmony_ci return out; 116cabdff1aSopenharmony_ci} 117cabdff1aSopenharmony_ci 118cabdff1aSopenharmony_ci/* 119cabdff1aSopenharmony_ci * ============================================================================= 120cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements 121cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 122cabdff1aSopenharmony_ci * Outputs - out 123cabdff1aSopenharmony_ci * Return Type - halfword 124cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied by 125cabdff1aSopenharmony_ci * unsigned byte elements from in_l, and then added adjacent to 126cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 127cabdff1aSopenharmony_ci * The results plus to signed half-word elements from in_c. 128cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) 129cabdff1aSopenharmony_ci * in_c : 1,2,3,4, 1,2,3,4 130cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 131cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 132cabdff1aSopenharmony_ci * out : 23,40,41,26, 23,40,41,26 133cabdff1aSopenharmony_ci * ============================================================================= 134cabdff1aSopenharmony_ci */ 135cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, 136cabdff1aSopenharmony_ci __m128i in_l) { 137cabdff1aSopenharmony_ci __m128i out; 138cabdff1aSopenharmony_ci 139cabdff1aSopenharmony_ci out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l); 140cabdff1aSopenharmony_ci out = __lsx_vmaddwod_h_bu(out, in_h, in_l); 141cabdff1aSopenharmony_ci return out; 142cabdff1aSopenharmony_ci} 143cabdff1aSopenharmony_ci 144cabdff1aSopenharmony_ci/* 145cabdff1aSopenharmony_ci * ============================================================================= 146cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements 147cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 148cabdff1aSopenharmony_ci * Outputs - out 149cabdff1aSopenharmony_ci * Return Type - halfword 150cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied by 151cabdff1aSopenharmony_ci * signed byte elements from in_l, and then added adjacent to 152cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 153cabdff1aSopenharmony_ci * The results plus to signed half-word elements from in_c. 154cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) 155cabdff1aSopenharmony_ci * in_c : 1,1,1,1, 1,1,1,1 156cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 157cabdff1aSopenharmony_ci * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 158cabdff1aSopenharmony_ci * out : -4,-24,-60,-112, 6,26,62,114 159cabdff1aSopenharmony_ci * ============================================================================= 160cabdff1aSopenharmony_ci */ 161cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, 162cabdff1aSopenharmony_ci __m128i in_l) { 163cabdff1aSopenharmony_ci __m128i out; 164cabdff1aSopenharmony_ci 165cabdff1aSopenharmony_ci out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); 166cabdff1aSopenharmony_ci out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); 167cabdff1aSopenharmony_ci return out; 168cabdff1aSopenharmony_ci} 169cabdff1aSopenharmony_ci 170cabdff1aSopenharmony_ci/* 171cabdff1aSopenharmony_ci * ============================================================================= 172cabdff1aSopenharmony_ci * Description : Dot product & addition of half-word vector elements 173cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 174cabdff1aSopenharmony_ci * Outputs - out 175cabdff1aSopenharmony_ci * Return Type - __m128i 176cabdff1aSopenharmony_ci * Details : Signed half-word elements from in_h are multiplied by 177cabdff1aSopenharmony_ci * signed half-word elements from in_l, and then added adjacent to 178cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 179cabdff1aSopenharmony_ci * Then the results plus to signed word elements from in_c. 180cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) 181cabdff1aSopenharmony_ci * in_c : 1,2,3,4 182cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8 183cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1 184cabdff1aSopenharmony_ci * out : 23,40,41,26 185cabdff1aSopenharmony_ci * ============================================================================= 186cabdff1aSopenharmony_ci */ 187cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, 188cabdff1aSopenharmony_ci __m128i in_l) { 189cabdff1aSopenharmony_ci __m128i out; 190cabdff1aSopenharmony_ci 191cabdff1aSopenharmony_ci out = __lsx_vmaddwev_w_h(in_c, in_h, in_l); 192cabdff1aSopenharmony_ci out = __lsx_vmaddwod_w_h(out, in_h, in_l); 193cabdff1aSopenharmony_ci return out; 194cabdff1aSopenharmony_ci} 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_ci/* 197cabdff1aSopenharmony_ci * ============================================================================= 198cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements 199cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 200cabdff1aSopenharmony_ci * Outputs - out 201cabdff1aSopenharmony_ci * Return Type - halfword 202cabdff1aSopenharmony_ci * Details : Signed byte elements from in_h are multiplied by 203cabdff1aSopenharmony_ci * signed byte elements from in_l, and then added adjacent to 204cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 205cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2_h_b(in_h, in_l) 206cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 207cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 208cabdff1aSopenharmony_ci * out : 22,38,38,22, 22,38,38,22 209cabdff1aSopenharmony_ci * ============================================================================= 210cabdff1aSopenharmony_ci */ 211cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { 212cabdff1aSopenharmony_ci __m128i out; 213cabdff1aSopenharmony_ci 214cabdff1aSopenharmony_ci out = __lsx_vmulwev_h_b(in_h, in_l); 215cabdff1aSopenharmony_ci out = __lsx_vmaddwod_h_b(out, in_h, in_l); 216cabdff1aSopenharmony_ci return out; 217cabdff1aSopenharmony_ci} 218cabdff1aSopenharmony_ci 219cabdff1aSopenharmony_ci/* 220cabdff1aSopenharmony_ci * ============================================================================= 221cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements 222cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 223cabdff1aSopenharmony_ci * Outputs - out 224cabdff1aSopenharmony_ci * Return Type - halfword 225cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied by 226cabdff1aSopenharmony_ci * unsigned byte elements from in_l, and then added adjacent to 227cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 228cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2_h_bu(in_h, in_l) 229cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 230cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 231cabdff1aSopenharmony_ci * out : 22,38,38,22, 22,38,38,22 232cabdff1aSopenharmony_ci * ============================================================================= 233cabdff1aSopenharmony_ci */ 234cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { 235cabdff1aSopenharmony_ci __m128i out; 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_ci out = __lsx_vmulwev_h_bu(in_h, in_l); 238cabdff1aSopenharmony_ci out = __lsx_vmaddwod_h_bu(out, in_h, in_l); 239cabdff1aSopenharmony_ci return out; 240cabdff1aSopenharmony_ci} 241cabdff1aSopenharmony_ci 242cabdff1aSopenharmony_ci/* 243cabdff1aSopenharmony_ci * ============================================================================= 244cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements 245cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 246cabdff1aSopenharmony_ci * Outputs - out 247cabdff1aSopenharmony_ci * Return Type - halfword 248cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied by 249cabdff1aSopenharmony_ci * signed byte elements from in_l, and then added adjacent to 250cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 251cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) 252cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 253cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 254cabdff1aSopenharmony_ci * out : 22,38,38,22, 22,38,38,6 255cabdff1aSopenharmony_ci * ============================================================================= 256cabdff1aSopenharmony_ci */ 257cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { 258cabdff1aSopenharmony_ci __m128i out; 259cabdff1aSopenharmony_ci 260cabdff1aSopenharmony_ci out = __lsx_vmulwev_h_bu_b(in_h, in_l); 261cabdff1aSopenharmony_ci out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); 262cabdff1aSopenharmony_ci return out; 263cabdff1aSopenharmony_ci} 264cabdff1aSopenharmony_ci 265cabdff1aSopenharmony_ci/* 266cabdff1aSopenharmony_ci * ============================================================================= 267cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements 268cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 269cabdff1aSopenharmony_ci * Outputs - out 270cabdff1aSopenharmony_ci * Return Type - halfword 271cabdff1aSopenharmony_ci * Details : Signed byte elements from in_h are multiplied by 272cabdff1aSopenharmony_ci * signed byte elements from in_l, and then added adjacent to 273cabdff1aSopenharmony_ci * each other to get results with the twice size of input. 274cabdff1aSopenharmony_ci * Example : out = __lsx_vdp2_w_h(in_h, in_l) 275cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8 276cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1 277cabdff1aSopenharmony_ci * out : 22,38,38,22 278cabdff1aSopenharmony_ci * ============================================================================= 279cabdff1aSopenharmony_ci */ 280cabdff1aSopenharmony_cistatic inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { 281cabdff1aSopenharmony_ci __m128i out; 282cabdff1aSopenharmony_ci 283cabdff1aSopenharmony_ci out = __lsx_vmulwev_w_h(in_h, in_l); 284cabdff1aSopenharmony_ci out = __lsx_vmaddwod_w_h(out, in_h, in_l); 285cabdff1aSopenharmony_ci return out; 286cabdff1aSopenharmony_ci} 287cabdff1aSopenharmony_ci 288cabdff1aSopenharmony_ci/* 289cabdff1aSopenharmony_ci * ============================================================================= 290cabdff1aSopenharmony_ci * Description : Clip all halfword elements of input vector between min & max 291cabdff1aSopenharmony_ci * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : 292cabdff1aSopenharmony_ci * (_in)) 293cabdff1aSopenharmony_ci * Arguments : Inputs - _in (input vector) 294cabdff1aSopenharmony_ci * - min (min threshold) 295cabdff1aSopenharmony_ci * - max (max threshold) 296cabdff1aSopenharmony_ci * Outputs - out (output vector with clipped elements) 297cabdff1aSopenharmony_ci * Return Type - signed halfword 298cabdff1aSopenharmony_ci * Example : out = __lsx_vclip_h(_in) 299cabdff1aSopenharmony_ci * _in : -8,2,280,249, -8,255,280,249 300cabdff1aSopenharmony_ci * min : 1,1,1,1, 1,1,1,1 301cabdff1aSopenharmony_ci * max : 9,9,9,9, 9,9,9,9 302cabdff1aSopenharmony_ci * out : 1,2,9,9, 1,9,9,9 303cabdff1aSopenharmony_ci * ============================================================================= 304cabdff1aSopenharmony_ci */ 305cabdff1aSopenharmony_cistatic inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { 306cabdff1aSopenharmony_ci __m128i out; 307cabdff1aSopenharmony_ci 308cabdff1aSopenharmony_ci out = __lsx_vmax_h(min, _in); 309cabdff1aSopenharmony_ci out = __lsx_vmin_h(max, out); 310cabdff1aSopenharmony_ci return out; 311cabdff1aSopenharmony_ci} 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_ci/* 314cabdff1aSopenharmony_ci * ============================================================================= 315cabdff1aSopenharmony_ci * Description : Set each element of vector between 0 and 255 316cabdff1aSopenharmony_ci * Arguments : Inputs - _in 317cabdff1aSopenharmony_ci * Outputs - out 318cabdff1aSopenharmony_ci * Return Type - halfword 319cabdff1aSopenharmony_ci * Details : Signed byte elements from _in are clamped between 0 and 255. 320cabdff1aSopenharmony_ci * Example : out = __lsx_vclip255_h(_in) 321cabdff1aSopenharmony_ci * _in : -8,255,280,249, -8,255,280,249 322cabdff1aSopenharmony_ci * out : 0,255,255,249, 0,255,255,249 323cabdff1aSopenharmony_ci * ============================================================================= 324cabdff1aSopenharmony_ci */ 325cabdff1aSopenharmony_cistatic inline __m128i __lsx_vclip255_h(__m128i _in) { 326cabdff1aSopenharmony_ci __m128i out; 327cabdff1aSopenharmony_ci 328cabdff1aSopenharmony_ci out = __lsx_vmaxi_h(_in, 0); 329cabdff1aSopenharmony_ci out = __lsx_vsat_hu(out, 7); 330cabdff1aSopenharmony_ci return out; 331cabdff1aSopenharmony_ci} 332cabdff1aSopenharmony_ci 333cabdff1aSopenharmony_ci/* 334cabdff1aSopenharmony_ci * ============================================================================= 335cabdff1aSopenharmony_ci * Description : Set each element of vector between 0 and 255 336cabdff1aSopenharmony_ci * Arguments : Inputs - _in 337cabdff1aSopenharmony_ci * Outputs - out 338cabdff1aSopenharmony_ci * Return Type - word 339cabdff1aSopenharmony_ci * Details : Signed byte elements from _in are clamped between 0 and 255. 340cabdff1aSopenharmony_ci * Example : out = __lsx_vclip255_w(_in) 341cabdff1aSopenharmony_ci * _in : -8,255,280,249 342cabdff1aSopenharmony_ci * out : 0,255,255,249 343cabdff1aSopenharmony_ci * ============================================================================= 344cabdff1aSopenharmony_ci */ 345cabdff1aSopenharmony_cistatic inline __m128i __lsx_vclip255_w(__m128i _in) { 346cabdff1aSopenharmony_ci __m128i out; 347cabdff1aSopenharmony_ci 348cabdff1aSopenharmony_ci out = __lsx_vmaxi_w(_in, 0); 349cabdff1aSopenharmony_ci out = __lsx_vsat_wu(out, 7); 350cabdff1aSopenharmony_ci return out; 351cabdff1aSopenharmony_ci} 352cabdff1aSopenharmony_ci 353cabdff1aSopenharmony_ci/* 354cabdff1aSopenharmony_ci * ============================================================================= 355cabdff1aSopenharmony_ci * Description : Swap two variables 356cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1 357cabdff1aSopenharmony_ci * Outputs - _in0, _in1 (in-place) 358cabdff1aSopenharmony_ci * Details : Swapping of two input variables using xor 359cabdff1aSopenharmony_ci * Example : LSX_SWAP(_in0, _in1) 360cabdff1aSopenharmony_ci * _in0 : 1,2,3,4 361cabdff1aSopenharmony_ci * _in1 : 5,6,7,8 362cabdff1aSopenharmony_ci * _in0(out) : 5,6,7,8 363cabdff1aSopenharmony_ci * _in1(out) : 1,2,3,4 364cabdff1aSopenharmony_ci * ============================================================================= 365cabdff1aSopenharmony_ci */ 366cabdff1aSopenharmony_ci#define LSX_SWAP(_in0, _in1) \ 367cabdff1aSopenharmony_ci { \ 368cabdff1aSopenharmony_ci _in0 = __lsx_vxor_v(_in0, _in1); \ 369cabdff1aSopenharmony_ci _in1 = __lsx_vxor_v(_in0, _in1); \ 370cabdff1aSopenharmony_ci _in0 = __lsx_vxor_v(_in0, _in1); \ 371cabdff1aSopenharmony_ci } 372cabdff1aSopenharmony_ci 373cabdff1aSopenharmony_ci/* 374cabdff1aSopenharmony_ci * ============================================================================= 375cabdff1aSopenharmony_ci * Description : Transpose 4x4 block with word elements in vectors 376cabdff1aSopenharmony_ci * Arguments : Inputs - in0, in1, in2, in3 377cabdff1aSopenharmony_ci * Outputs - out0, out1, out2, out3 378cabdff1aSopenharmony_ci * Details : 379cabdff1aSopenharmony_ci * Example : 380cabdff1aSopenharmony_ci * 1, 2, 3, 4 1, 5, 9,13 381cabdff1aSopenharmony_ci * 5, 6, 7, 8 to 2, 6,10,14 382cabdff1aSopenharmony_ci * 9,10,11,12 =====> 3, 7,11,15 383cabdff1aSopenharmony_ci * 13,14,15,16 4, 8,12,16 384cabdff1aSopenharmony_ci * ============================================================================= 385cabdff1aSopenharmony_ci */ 386cabdff1aSopenharmony_ci#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 387cabdff1aSopenharmony_ci { \ 388cabdff1aSopenharmony_ci __m128i _t0, _t1, _t2, _t3; \ 389cabdff1aSopenharmony_ci \ 390cabdff1aSopenharmony_ci _t0 = __lsx_vilvl_w(_in1, _in0); \ 391cabdff1aSopenharmony_ci _t1 = __lsx_vilvh_w(_in1, _in0); \ 392cabdff1aSopenharmony_ci _t2 = __lsx_vilvl_w(_in3, _in2); \ 393cabdff1aSopenharmony_ci _t3 = __lsx_vilvh_w(_in3, _in2); \ 394cabdff1aSopenharmony_ci _out0 = __lsx_vilvl_d(_t2, _t0); \ 395cabdff1aSopenharmony_ci _out1 = __lsx_vilvh_d(_t2, _t0); \ 396cabdff1aSopenharmony_ci _out2 = __lsx_vilvl_d(_t3, _t1); \ 397cabdff1aSopenharmony_ci _out3 = __lsx_vilvh_d(_t3, _t1); \ 398cabdff1aSopenharmony_ci } 399cabdff1aSopenharmony_ci 400cabdff1aSopenharmony_ci/* 401cabdff1aSopenharmony_ci * ============================================================================= 402cabdff1aSopenharmony_ci * Description : Transpose 8x8 block with byte elements in vectors 403cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 404cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 405cabdff1aSopenharmony_ci * _out7 406cabdff1aSopenharmony_ci * Details : The rows of the matrix become columns, and the columns 407cabdff1aSopenharmony_ci * become rows. 408cabdff1aSopenharmony_ci * Example : LSX_TRANSPOSE8x8_B 409cabdff1aSopenharmony_ci * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 410cabdff1aSopenharmony_ci * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 411cabdff1aSopenharmony_ci * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 412cabdff1aSopenharmony_ci * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 413cabdff1aSopenharmony_ci * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 414cabdff1aSopenharmony_ci * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 415cabdff1aSopenharmony_ci * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 416cabdff1aSopenharmony_ci * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 417cabdff1aSopenharmony_ci * 418cabdff1aSopenharmony_ci * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 419cabdff1aSopenharmony_ci * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 420cabdff1aSopenharmony_ci * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 421cabdff1aSopenharmony_ci * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 422cabdff1aSopenharmony_ci * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 423cabdff1aSopenharmony_ci * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 424cabdff1aSopenharmony_ci * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 425cabdff1aSopenharmony_ci * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 426cabdff1aSopenharmony_ci * ============================================================================= 427cabdff1aSopenharmony_ci */ 428cabdff1aSopenharmony_ci#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 429cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 430cabdff1aSopenharmony_ci _out7) \ 431cabdff1aSopenharmony_ci { \ 432cabdff1aSopenharmony_ci __m128i zero = { 0 }; \ 433cabdff1aSopenharmony_ci __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ 434cabdff1aSopenharmony_ci __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 435cabdff1aSopenharmony_ci \ 436cabdff1aSopenharmony_ci _t0 = __lsx_vilvl_b(_in2, _in0); \ 437cabdff1aSopenharmony_ci _t1 = __lsx_vilvl_b(_in3, _in1); \ 438cabdff1aSopenharmony_ci _t2 = __lsx_vilvl_b(_in6, _in4); \ 439cabdff1aSopenharmony_ci _t3 = __lsx_vilvl_b(_in7, _in5); \ 440cabdff1aSopenharmony_ci _t4 = __lsx_vilvl_b(_t1, _t0); \ 441cabdff1aSopenharmony_ci _t5 = __lsx_vilvh_b(_t1, _t0); \ 442cabdff1aSopenharmony_ci _t6 = __lsx_vilvl_b(_t3, _t2); \ 443cabdff1aSopenharmony_ci _t7 = __lsx_vilvh_b(_t3, _t2); \ 444cabdff1aSopenharmony_ci _out0 = __lsx_vilvl_w(_t6, _t4); \ 445cabdff1aSopenharmony_ci _out2 = __lsx_vilvh_w(_t6, _t4); \ 446cabdff1aSopenharmony_ci _out4 = __lsx_vilvl_w(_t7, _t5); \ 447cabdff1aSopenharmony_ci _out6 = __lsx_vilvh_w(_t7, _t5); \ 448cabdff1aSopenharmony_ci _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ 449cabdff1aSopenharmony_ci _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ 450cabdff1aSopenharmony_ci _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ 451cabdff1aSopenharmony_ci _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ 452cabdff1aSopenharmony_ci } 453cabdff1aSopenharmony_ci 454cabdff1aSopenharmony_ci/* 455cabdff1aSopenharmony_ci * ============================================================================= 456cabdff1aSopenharmony_ci * Description : Transpose 8x8 block with half-word elements in vectors 457cabdff1aSopenharmony_ci * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 458cabdff1aSopenharmony_ci * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 459cabdff1aSopenharmony_ci * Details : 460cabdff1aSopenharmony_ci * Example : 461cabdff1aSopenharmony_ci * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 462cabdff1aSopenharmony_ci * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 463cabdff1aSopenharmony_ci * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 464cabdff1aSopenharmony_ci * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 465cabdff1aSopenharmony_ci * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 466cabdff1aSopenharmony_ci * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 467cabdff1aSopenharmony_ci * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 468cabdff1aSopenharmony_ci * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 469cabdff1aSopenharmony_ci * ============================================================================= 470cabdff1aSopenharmony_ci */ 471cabdff1aSopenharmony_ci#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 472cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 473cabdff1aSopenharmony_ci _out7) \ 474cabdff1aSopenharmony_ci { \ 475cabdff1aSopenharmony_ci __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 476cabdff1aSopenharmony_ci \ 477cabdff1aSopenharmony_ci _s0 = __lsx_vilvl_h(_in6, _in4); \ 478cabdff1aSopenharmony_ci _s1 = __lsx_vilvl_h(_in7, _in5); \ 479cabdff1aSopenharmony_ci _t0 = __lsx_vilvl_h(_s1, _s0); \ 480cabdff1aSopenharmony_ci _t1 = __lsx_vilvh_h(_s1, _s0); \ 481cabdff1aSopenharmony_ci _s0 = __lsx_vilvh_h(_in6, _in4); \ 482cabdff1aSopenharmony_ci _s1 = __lsx_vilvh_h(_in7, _in5); \ 483cabdff1aSopenharmony_ci _t2 = __lsx_vilvl_h(_s1, _s0); \ 484cabdff1aSopenharmony_ci _t3 = __lsx_vilvh_h(_s1, _s0); \ 485cabdff1aSopenharmony_ci _s0 = __lsx_vilvl_h(_in2, _in0); \ 486cabdff1aSopenharmony_ci _s1 = __lsx_vilvl_h(_in3, _in1); \ 487cabdff1aSopenharmony_ci _t4 = __lsx_vilvl_h(_s1, _s0); \ 488cabdff1aSopenharmony_ci _t5 = __lsx_vilvh_h(_s1, _s0); \ 489cabdff1aSopenharmony_ci _s0 = __lsx_vilvh_h(_in2, _in0); \ 490cabdff1aSopenharmony_ci _s1 = __lsx_vilvh_h(_in3, _in1); \ 491cabdff1aSopenharmony_ci _t6 = __lsx_vilvl_h(_s1, _s0); \ 492cabdff1aSopenharmony_ci _t7 = __lsx_vilvh_h(_s1, _s0); \ 493cabdff1aSopenharmony_ci \ 494cabdff1aSopenharmony_ci _out0 = __lsx_vpickev_d(_t0, _t4); \ 495cabdff1aSopenharmony_ci _out2 = __lsx_vpickev_d(_t1, _t5); \ 496cabdff1aSopenharmony_ci _out4 = __lsx_vpickev_d(_t2, _t6); \ 497cabdff1aSopenharmony_ci _out6 = __lsx_vpickev_d(_t3, _t7); \ 498cabdff1aSopenharmony_ci _out1 = __lsx_vpickod_d(_t0, _t4); \ 499cabdff1aSopenharmony_ci _out3 = __lsx_vpickod_d(_t1, _t5); \ 500cabdff1aSopenharmony_ci _out5 = __lsx_vpickod_d(_t2, _t6); \ 501cabdff1aSopenharmony_ci _out7 = __lsx_vpickod_d(_t3, _t7); \ 502cabdff1aSopenharmony_ci } 503cabdff1aSopenharmony_ci 504cabdff1aSopenharmony_ci/* 505cabdff1aSopenharmony_ci * ============================================================================= 506cabdff1aSopenharmony_ci * Description : Transpose input 8x4 byte block into 4x8 507cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) 508cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) 509cabdff1aSopenharmony_ci * Return Type - as per RTYPE 510cabdff1aSopenharmony_ci * Details : The rows of the matrix become columns, and the columns become 511cabdff1aSopenharmony_ci * rows. 512cabdff1aSopenharmony_ci * Example : LSX_TRANSPOSE8x4_B 513cabdff1aSopenharmony_ci * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 514cabdff1aSopenharmony_ci * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 515cabdff1aSopenharmony_ci * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 516cabdff1aSopenharmony_ci * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 517cabdff1aSopenharmony_ci * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 518cabdff1aSopenharmony_ci * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 519cabdff1aSopenharmony_ci * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 520cabdff1aSopenharmony_ci * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 521cabdff1aSopenharmony_ci * 522cabdff1aSopenharmony_ci * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 523cabdff1aSopenharmony_ci * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 524cabdff1aSopenharmony_ci * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 525cabdff1aSopenharmony_ci * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 526cabdff1aSopenharmony_ci * ============================================================================= 527cabdff1aSopenharmony_ci */ 528cabdff1aSopenharmony_ci#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 529cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3) \ 530cabdff1aSopenharmony_ci { \ 531cabdff1aSopenharmony_ci __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 532cabdff1aSopenharmony_ci \ 533cabdff1aSopenharmony_ci _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ 534cabdff1aSopenharmony_ci _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ 535cabdff1aSopenharmony_ci _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ 536cabdff1aSopenharmony_ci _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ 537cabdff1aSopenharmony_ci _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ 538cabdff1aSopenharmony_ci \ 539cabdff1aSopenharmony_ci _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ 540cabdff1aSopenharmony_ci _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ 541cabdff1aSopenharmony_ci _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ 542cabdff1aSopenharmony_ci \ 543cabdff1aSopenharmony_ci _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ 544cabdff1aSopenharmony_ci _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ 545cabdff1aSopenharmony_ci _out1 = __lsx_vilvh_d(_out2, _out0); \ 546cabdff1aSopenharmony_ci _out3 = __lsx_vilvh_d(_out0, _out2); \ 547cabdff1aSopenharmony_ci } 548cabdff1aSopenharmony_ci 549cabdff1aSopenharmony_ci/* 550cabdff1aSopenharmony_ci * ============================================================================= 551cabdff1aSopenharmony_ci * Description : Transpose 16x8 block with byte elements in vectors 552cabdff1aSopenharmony_ci * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 553cabdff1aSopenharmony_ci * in9, in10, in11, in12, in13, in14, in15 554cabdff1aSopenharmony_ci * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 555cabdff1aSopenharmony_ci * Details : 556cabdff1aSopenharmony_ci * Example : 557cabdff1aSopenharmony_ci * 000,001,002,003,004,005,006,007 558cabdff1aSopenharmony_ci * 008,009,010,011,012,013,014,015 559cabdff1aSopenharmony_ci * 016,017,018,019,020,021,022,023 560cabdff1aSopenharmony_ci * 024,025,026,027,028,029,030,031 561cabdff1aSopenharmony_ci * 032,033,034,035,036,037,038,039 562cabdff1aSopenharmony_ci * 040,041,042,043,044,045,046,047 000,008,...,112,120 563cabdff1aSopenharmony_ci * 048,049,050,051,052,053,054,055 001,009,...,113,121 564cabdff1aSopenharmony_ci * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 565cabdff1aSopenharmony_ci * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 566cabdff1aSopenharmony_ci * 072,073,074,075,076,077,078,079 004,012,...,116,124 567cabdff1aSopenharmony_ci * 080,081,082,083,084,085,086,087 005,013,...,117,125 568cabdff1aSopenharmony_ci * 088,089,090,091,092,093,094,095 006,014,...,118,126 569cabdff1aSopenharmony_ci * 096,097,098,099,100,101,102,103 007,015,...,119,127 570cabdff1aSopenharmony_ci * 104,105,106,107,108,109,110,111 571cabdff1aSopenharmony_ci * 112,113,114,115,116,117,118,119 572cabdff1aSopenharmony_ci * 120,121,122,123,124,125,126,127 573cabdff1aSopenharmony_ci * ============================================================================= 574cabdff1aSopenharmony_ci */ 575cabdff1aSopenharmony_ci#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 576cabdff1aSopenharmony_ci _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ 577cabdff1aSopenharmony_ci _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ 578cabdff1aSopenharmony_ci _out6, _out7) \ 579cabdff1aSopenharmony_ci { \ 580cabdff1aSopenharmony_ci __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ 581cabdff1aSopenharmony_ci __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 582cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ 583cabdff1aSopenharmony_ci _tmp0, _tmp1, _tmp2, _tmp3); \ 584cabdff1aSopenharmony_ci DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ 585cabdff1aSopenharmony_ci _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ 586cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ 587cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ 588cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ 589cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ 590cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ 591cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ 592cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ 593cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ 594cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ 595cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ 596cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ 597cabdff1aSopenharmony_ci DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ 598cabdff1aSopenharmony_ci } 599cabdff1aSopenharmony_ci 600cabdff1aSopenharmony_ci/* 601cabdff1aSopenharmony_ci * ============================================================================= 602cabdff1aSopenharmony_ci * Description : Butterfly of 4 input vectors 603cabdff1aSopenharmony_ci * Arguments : Inputs - in0, in1, in2, in3 604cabdff1aSopenharmony_ci * Outputs - out0, out1, out2, out3 605cabdff1aSopenharmony_ci * Details : Butterfly operation 606cabdff1aSopenharmony_ci * Example : 607cabdff1aSopenharmony_ci * out0 = in0 + in3; 608cabdff1aSopenharmony_ci * out1 = in1 + in2; 609cabdff1aSopenharmony_ci * out2 = in1 - in2; 610cabdff1aSopenharmony_ci * out3 = in0 - in3; 611cabdff1aSopenharmony_ci * ============================================================================= 612cabdff1aSopenharmony_ci */ 613cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 614cabdff1aSopenharmony_ci { \ 615cabdff1aSopenharmony_ci _out0 = __lsx_vadd_b(_in0, _in3); \ 616cabdff1aSopenharmony_ci _out1 = __lsx_vadd_b(_in1, _in2); \ 617cabdff1aSopenharmony_ci _out2 = __lsx_vsub_b(_in1, _in2); \ 618cabdff1aSopenharmony_ci _out3 = __lsx_vsub_b(_in0, _in3); \ 619cabdff1aSopenharmony_ci } 620cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 621cabdff1aSopenharmony_ci { \ 622cabdff1aSopenharmony_ci _out0 = __lsx_vadd_h(_in0, _in3); \ 623cabdff1aSopenharmony_ci _out1 = __lsx_vadd_h(_in1, _in2); \ 624cabdff1aSopenharmony_ci _out2 = __lsx_vsub_h(_in1, _in2); \ 625cabdff1aSopenharmony_ci _out3 = __lsx_vsub_h(_in0, _in3); \ 626cabdff1aSopenharmony_ci } 627cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 628cabdff1aSopenharmony_ci { \ 629cabdff1aSopenharmony_ci _out0 = __lsx_vadd_w(_in0, _in3); \ 630cabdff1aSopenharmony_ci _out1 = __lsx_vadd_w(_in1, _in2); \ 631cabdff1aSopenharmony_ci _out2 = __lsx_vsub_w(_in1, _in2); \ 632cabdff1aSopenharmony_ci _out3 = __lsx_vsub_w(_in0, _in3); \ 633cabdff1aSopenharmony_ci } 634cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 635cabdff1aSopenharmony_ci { \ 636cabdff1aSopenharmony_ci _out0 = __lsx_vadd_d(_in0, _in3); \ 637cabdff1aSopenharmony_ci _out1 = __lsx_vadd_d(_in1, _in2); \ 638cabdff1aSopenharmony_ci _out2 = __lsx_vsub_d(_in1, _in2); \ 639cabdff1aSopenharmony_ci _out3 = __lsx_vsub_d(_in0, _in3); \ 640cabdff1aSopenharmony_ci } 641cabdff1aSopenharmony_ci 642cabdff1aSopenharmony_ci/* 643cabdff1aSopenharmony_ci * ============================================================================= 644cabdff1aSopenharmony_ci * Description : Butterfly of 8 input vectors 645cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ 646cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, ~ 647cabdff1aSopenharmony_ci * Details : Butterfly operation 648cabdff1aSopenharmony_ci * Example : 649cabdff1aSopenharmony_ci * _out0 = _in0 + _in7; 650cabdff1aSopenharmony_ci * _out1 = _in1 + _in6; 651cabdff1aSopenharmony_ci * _out2 = _in2 + _in5; 652cabdff1aSopenharmony_ci * _out3 = _in3 + _in4; 653cabdff1aSopenharmony_ci * _out4 = _in3 - _in4; 654cabdff1aSopenharmony_ci * _out5 = _in2 - _in5; 655cabdff1aSopenharmony_ci * _out6 = _in1 - _in6; 656cabdff1aSopenharmony_ci * _out7 = _in0 - _in7; 657cabdff1aSopenharmony_ci * ============================================================================= 658cabdff1aSopenharmony_ci */ 659cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 660cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 661cabdff1aSopenharmony_ci _out7) \ 662cabdff1aSopenharmony_ci { \ 663cabdff1aSopenharmony_ci _out0 = __lsx_vadd_b(_in0, _in7); \ 664cabdff1aSopenharmony_ci _out1 = __lsx_vadd_b(_in1, _in6); \ 665cabdff1aSopenharmony_ci _out2 = __lsx_vadd_b(_in2, _in5); \ 666cabdff1aSopenharmony_ci _out3 = __lsx_vadd_b(_in3, _in4); \ 667cabdff1aSopenharmony_ci _out4 = __lsx_vsub_b(_in3, _in4); \ 668cabdff1aSopenharmony_ci _out5 = __lsx_vsub_b(_in2, _in5); \ 669cabdff1aSopenharmony_ci _out6 = __lsx_vsub_b(_in1, _in6); \ 670cabdff1aSopenharmony_ci _out7 = __lsx_vsub_b(_in0, _in7); \ 671cabdff1aSopenharmony_ci } 672cabdff1aSopenharmony_ci 673cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 674cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 675cabdff1aSopenharmony_ci _out7) \ 676cabdff1aSopenharmony_ci { \ 677cabdff1aSopenharmony_ci _out0 = __lsx_vadd_h(_in0, _in7); \ 678cabdff1aSopenharmony_ci _out1 = __lsx_vadd_h(_in1, _in6); \ 679cabdff1aSopenharmony_ci _out2 = __lsx_vadd_h(_in2, _in5); \ 680cabdff1aSopenharmony_ci _out3 = __lsx_vadd_h(_in3, _in4); \ 681cabdff1aSopenharmony_ci _out4 = __lsx_vsub_h(_in3, _in4); \ 682cabdff1aSopenharmony_ci _out5 = __lsx_vsub_h(_in2, _in5); \ 683cabdff1aSopenharmony_ci _out6 = __lsx_vsub_h(_in1, _in6); \ 684cabdff1aSopenharmony_ci _out7 = __lsx_vsub_h(_in0, _in7); \ 685cabdff1aSopenharmony_ci } 686cabdff1aSopenharmony_ci 687cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 688cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 689cabdff1aSopenharmony_ci _out7) \ 690cabdff1aSopenharmony_ci { \ 691cabdff1aSopenharmony_ci _out0 = __lsx_vadd_w(_in0, _in7); \ 692cabdff1aSopenharmony_ci _out1 = __lsx_vadd_w(_in1, _in6); \ 693cabdff1aSopenharmony_ci _out2 = __lsx_vadd_w(_in2, _in5); \ 694cabdff1aSopenharmony_ci _out3 = __lsx_vadd_w(_in3, _in4); \ 695cabdff1aSopenharmony_ci _out4 = __lsx_vsub_w(_in3, _in4); \ 696cabdff1aSopenharmony_ci _out5 = __lsx_vsub_w(_in2, _in5); \ 697cabdff1aSopenharmony_ci _out6 = __lsx_vsub_w(_in1, _in6); \ 698cabdff1aSopenharmony_ci _out7 = __lsx_vsub_w(_in0, _in7); \ 699cabdff1aSopenharmony_ci } 700cabdff1aSopenharmony_ci 701cabdff1aSopenharmony_ci#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 702cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 703cabdff1aSopenharmony_ci _out7) \ 704cabdff1aSopenharmony_ci { \ 705cabdff1aSopenharmony_ci _out0 = __lsx_vadd_d(_in0, _in7); \ 706cabdff1aSopenharmony_ci _out1 = __lsx_vadd_d(_in1, _in6); \ 707cabdff1aSopenharmony_ci _out2 = __lsx_vadd_d(_in2, _in5); \ 708cabdff1aSopenharmony_ci _out3 = __lsx_vadd_d(_in3, _in4); \ 709cabdff1aSopenharmony_ci _out4 = __lsx_vsub_d(_in3, _in4); \ 710cabdff1aSopenharmony_ci _out5 = __lsx_vsub_d(_in2, _in5); \ 711cabdff1aSopenharmony_ci _out6 = __lsx_vsub_d(_in1, _in6); \ 712cabdff1aSopenharmony_ci _out7 = __lsx_vsub_d(_in0, _in7); \ 713cabdff1aSopenharmony_ci } 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci#endif // LSX 716cabdff1aSopenharmony_ci 717cabdff1aSopenharmony_ci#ifdef __loongarch_asx 718cabdff1aSopenharmony_ci#include <lasxintrin.h> 719cabdff1aSopenharmony_ci/* 720cabdff1aSopenharmony_ci * ============================================================================= 721cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements 722cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 723cabdff1aSopenharmony_ci * Output - out 724cabdff1aSopenharmony_ci * Return Type - signed halfword 725cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied with 726cabdff1aSopenharmony_ci * unsigned byte elements from in_l producing a result 727cabdff1aSopenharmony_ci * twice the size of input i.e. signed halfword. 728cabdff1aSopenharmony_ci * Then this multiplied results of adjacent odd-even elements 729cabdff1aSopenharmony_ci * are added to the out vector 730cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 731cabdff1aSopenharmony_ci * ============================================================================= 732cabdff1aSopenharmony_ci */ 733cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { 734cabdff1aSopenharmony_ci __m256i out; 735cabdff1aSopenharmony_ci 736cabdff1aSopenharmony_ci out = __lasx_xvmulwev_h_bu(in_h, in_l); 737cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); 738cabdff1aSopenharmony_ci return out; 739cabdff1aSopenharmony_ci} 740cabdff1aSopenharmony_ci 741cabdff1aSopenharmony_ci/* 742cabdff1aSopenharmony_ci * ============================================================================= 743cabdff1aSopenharmony_ci * Description : Dot product of byte vector elements 744cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 745cabdff1aSopenharmony_ci * Output - out 746cabdff1aSopenharmony_ci * Return Type - signed halfword 747cabdff1aSopenharmony_ci * Details : Signed byte elements from in_h are multiplied with 748cabdff1aSopenharmony_ci * signed byte elements from in_l producing a result 749cabdff1aSopenharmony_ci * twice the size of input i.e. signed halfword. 750cabdff1aSopenharmony_ci * Then this multiplication results of adjacent odd-even elements 751cabdff1aSopenharmony_ci * are added to the out vector 752cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 753cabdff1aSopenharmony_ci * ============================================================================= 754cabdff1aSopenharmony_ci */ 755cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { 756cabdff1aSopenharmony_ci __m256i out; 757cabdff1aSopenharmony_ci 758cabdff1aSopenharmony_ci out = __lasx_xvmulwev_h_b(in_h, in_l); 759cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_h_b(out, in_h, in_l); 760cabdff1aSopenharmony_ci return out; 761cabdff1aSopenharmony_ci} 762cabdff1aSopenharmony_ci 763cabdff1aSopenharmony_ci/* 764cabdff1aSopenharmony_ci * ============================================================================= 765cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements 766cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 767cabdff1aSopenharmony_ci * Output - out 768cabdff1aSopenharmony_ci * Return Type - signed word 769cabdff1aSopenharmony_ci * Details : Signed halfword elements from in_h are multiplied with 770cabdff1aSopenharmony_ci * signed halfword elements from in_l producing a result 771cabdff1aSopenharmony_ci * twice the size of input i.e. signed word. 772cabdff1aSopenharmony_ci * Then this multiplied results of adjacent odd-even elements 773cabdff1aSopenharmony_ci * are added to the out vector. 774cabdff1aSopenharmony_ci * Example : out = __lasx_xvdp2_w_h(in_h, in_l) 775cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 776cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 777cabdff1aSopenharmony_ci * out : 22,38,38,22, 22,38,38,22 778cabdff1aSopenharmony_ci * ============================================================================= 779cabdff1aSopenharmony_ci */ 780cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { 781cabdff1aSopenharmony_ci __m256i out; 782cabdff1aSopenharmony_ci 783cabdff1aSopenharmony_ci out = __lasx_xvmulwev_w_h(in_h, in_l); 784cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 785cabdff1aSopenharmony_ci return out; 786cabdff1aSopenharmony_ci} 787cabdff1aSopenharmony_ci 788cabdff1aSopenharmony_ci/* 789cabdff1aSopenharmony_ci * ============================================================================= 790cabdff1aSopenharmony_ci * Description : Dot product of word vector elements 791cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 792cabdff1aSopenharmony_ci * Output - out 793cabdff1aSopenharmony_ci * Return Type - signed double 794cabdff1aSopenharmony_ci * Details : Signed word elements from in_h are multiplied with 795cabdff1aSopenharmony_ci * signed word elements from in_l producing a result 796cabdff1aSopenharmony_ci * twice the size of input i.e. signed double-word. 797cabdff1aSopenharmony_ci * Then this multiplied results of adjacent odd-even elements 798cabdff1aSopenharmony_ci * are added to the out vector. 799cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 800cabdff1aSopenharmony_ci * ============================================================================= 801cabdff1aSopenharmony_ci */ 802cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { 803cabdff1aSopenharmony_ci __m256i out; 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci out = __lasx_xvmulwev_d_w(in_h, in_l); 806cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_d_w(out, in_h, in_l); 807cabdff1aSopenharmony_ci return out; 808cabdff1aSopenharmony_ci} 809cabdff1aSopenharmony_ci 810cabdff1aSopenharmony_ci/* 811cabdff1aSopenharmony_ci * ============================================================================= 812cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements 813cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 814cabdff1aSopenharmony_ci * Output - out 815cabdff1aSopenharmony_ci * Return Type - signed word 816cabdff1aSopenharmony_ci * Details : Unsigned halfword elements from in_h are multiplied with 817cabdff1aSopenharmony_ci * signed halfword elements from in_l producing a result 818cabdff1aSopenharmony_ci * twice the size of input i.e. unsigned word. 819cabdff1aSopenharmony_ci * Multiplication result of adjacent odd-even elements 820cabdff1aSopenharmony_ci * are added to the out vector 821cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) 822cabdff1aSopenharmony_ci * ============================================================================= 823cabdff1aSopenharmony_ci */ 824cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { 825cabdff1aSopenharmony_ci __m256i out; 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_ci out = __lasx_xvmulwev_w_hu_h(in_h, in_l); 828cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); 829cabdff1aSopenharmony_ci return out; 830cabdff1aSopenharmony_ci} 831cabdff1aSopenharmony_ci 832cabdff1aSopenharmony_ci/* 833cabdff1aSopenharmony_ci * ============================================================================= 834cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements 835cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 836cabdff1aSopenharmony_ci * Output - out 837cabdff1aSopenharmony_ci * Return Type - halfword 838cabdff1aSopenharmony_ci * Details : Signed byte elements from in_h are multiplied with 839cabdff1aSopenharmony_ci * signed byte elements from in_l producing a result 840cabdff1aSopenharmony_ci * twice the size of input i.e. signed halfword. 841cabdff1aSopenharmony_ci * Then this multiplied results of adjacent odd-even elements 842cabdff1aSopenharmony_ci * are added to the in_c vector. 843cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 844cabdff1aSopenharmony_ci * ============================================================================= 845cabdff1aSopenharmony_ci */ 846cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, 847cabdff1aSopenharmony_ci __m256i in_l) { 848cabdff1aSopenharmony_ci __m256i out; 849cabdff1aSopenharmony_ci 850cabdff1aSopenharmony_ci out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); 851cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_h_b(out, in_h, in_l); 852cabdff1aSopenharmony_ci return out; 853cabdff1aSopenharmony_ci} 854cabdff1aSopenharmony_ci 855cabdff1aSopenharmony_ci/* 856cabdff1aSopenharmony_ci * ============================================================================= 857cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements 858cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 859cabdff1aSopenharmony_ci * Output - out 860cabdff1aSopenharmony_ci * Return Type - halfword 861cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied with 862cabdff1aSopenharmony_ci * unsigned byte elements from in_l producing a result 863cabdff1aSopenharmony_ci * twice the size of input i.e. signed halfword. 864cabdff1aSopenharmony_ci * Then this multiplied results of adjacent odd-even elements 865cabdff1aSopenharmony_ci * are added to the in_c vector. 866cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 867cabdff1aSopenharmony_ci * ============================================================================= 868cabdff1aSopenharmony_ci */ 869cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, 870cabdff1aSopenharmony_ci __m256i in_l) { 871cabdff1aSopenharmony_ci __m256i out; 872cabdff1aSopenharmony_ci 873cabdff1aSopenharmony_ci out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); 874cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); 875cabdff1aSopenharmony_ci return out; 876cabdff1aSopenharmony_ci} 877cabdff1aSopenharmony_ci 878cabdff1aSopenharmony_ci/* 879cabdff1aSopenharmony_ci * ============================================================================= 880cabdff1aSopenharmony_ci * Description : Dot product & addition of byte vector elements 881cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 882cabdff1aSopenharmony_ci * Output - out 883cabdff1aSopenharmony_ci * Return Type - halfword 884cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied with 885cabdff1aSopenharmony_ci * signed byte elements from in_l producing a result 886cabdff1aSopenharmony_ci * twice the size of input i.e. signed halfword. 887cabdff1aSopenharmony_ci * Then this multiplied results of adjacent odd-even elements 888cabdff1aSopenharmony_ci * are added to the in_c vector. 889cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 890cabdff1aSopenharmony_ci * ============================================================================= 891cabdff1aSopenharmony_ci */ 892cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, 893cabdff1aSopenharmony_ci __m256i in_l) { 894cabdff1aSopenharmony_ci __m256i out; 895cabdff1aSopenharmony_ci 896cabdff1aSopenharmony_ci out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); 897cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); 898cabdff1aSopenharmony_ci return out; 899cabdff1aSopenharmony_ci} 900cabdff1aSopenharmony_ci 901cabdff1aSopenharmony_ci/* 902cabdff1aSopenharmony_ci * ============================================================================= 903cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements 904cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 905cabdff1aSopenharmony_ci * Output - out 906cabdff1aSopenharmony_ci * Return Type - per RTYPE 907cabdff1aSopenharmony_ci * Details : Signed halfword elements from in_h are multiplied with 908cabdff1aSopenharmony_ci * signed halfword elements from in_l producing a result 909cabdff1aSopenharmony_ci * twice the size of input i.e. signed word. 910cabdff1aSopenharmony_ci * Multiplication result of adjacent odd-even elements 911cabdff1aSopenharmony_ci * are added to the in_c vector. 912cabdff1aSopenharmony_ci * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 913cabdff1aSopenharmony_ci * in_c : 1,2,3,4, 1,2,3,4 914cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, 915cabdff1aSopenharmony_ci * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, 916cabdff1aSopenharmony_ci * out : 23,40,41,26, 23,40,41,26 917cabdff1aSopenharmony_ci * ============================================================================= 918cabdff1aSopenharmony_ci */ 919cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, 920cabdff1aSopenharmony_ci __m256i in_l) { 921cabdff1aSopenharmony_ci __m256i out; 922cabdff1aSopenharmony_ci 923cabdff1aSopenharmony_ci out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); 924cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 925cabdff1aSopenharmony_ci return out; 926cabdff1aSopenharmony_ci} 927cabdff1aSopenharmony_ci 928cabdff1aSopenharmony_ci/* 929cabdff1aSopenharmony_ci * ============================================================================= 930cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements 931cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 932cabdff1aSopenharmony_ci * Output - out 933cabdff1aSopenharmony_ci * Return Type - signed word 934cabdff1aSopenharmony_ci * Details : Unsigned halfword elements from in_h are multiplied with 935cabdff1aSopenharmony_ci * unsigned halfword elements from in_l producing a result 936cabdff1aSopenharmony_ci * twice the size of input i.e. signed word. 937cabdff1aSopenharmony_ci * Multiplication result of adjacent odd-even elements 938cabdff1aSopenharmony_ci * are added to the in_c vector. 939cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 940cabdff1aSopenharmony_ci * ============================================================================= 941cabdff1aSopenharmony_ci */ 942cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, 943cabdff1aSopenharmony_ci __m256i in_l) { 944cabdff1aSopenharmony_ci __m256i out; 945cabdff1aSopenharmony_ci 946cabdff1aSopenharmony_ci out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); 947cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); 948cabdff1aSopenharmony_ci return out; 949cabdff1aSopenharmony_ci} 950cabdff1aSopenharmony_ci 951cabdff1aSopenharmony_ci/* 952cabdff1aSopenharmony_ci * ============================================================================= 953cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements 954cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 955cabdff1aSopenharmony_ci * Output - out 956cabdff1aSopenharmony_ci * Return Type - signed word 957cabdff1aSopenharmony_ci * Details : Unsigned halfword elements from in_h are multiplied with 958cabdff1aSopenharmony_ci * signed halfword elements from in_l producing a result 959cabdff1aSopenharmony_ci * twice the size of input i.e. signed word. 960cabdff1aSopenharmony_ci * Multiplication result of adjacent odd-even elements 961cabdff1aSopenharmony_ci * are added to the in_c vector 962cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) 963cabdff1aSopenharmony_ci * ============================================================================= 964cabdff1aSopenharmony_ci */ 965cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, 966cabdff1aSopenharmony_ci __m256i in_l) { 967cabdff1aSopenharmony_ci __m256i out; 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_ci out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); 970cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); 971cabdff1aSopenharmony_ci return out; 972cabdff1aSopenharmony_ci} 973cabdff1aSopenharmony_ci 974cabdff1aSopenharmony_ci/* 975cabdff1aSopenharmony_ci * ============================================================================= 976cabdff1aSopenharmony_ci * Description : Vector Unsigned Dot Product and Subtract 977cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 978cabdff1aSopenharmony_ci * Output - out 979cabdff1aSopenharmony_ci * Return Type - signed halfword 980cabdff1aSopenharmony_ci * Details : Unsigned byte elements from in_h are multiplied with 981cabdff1aSopenharmony_ci * unsigned byte elements from in_l producing a result 982cabdff1aSopenharmony_ci * twice the size of input i.e. signed halfword. 983cabdff1aSopenharmony_ci * Multiplication result of adjacent odd-even elements 984cabdff1aSopenharmony_ci * are added together and subtracted from double width elements 985cabdff1aSopenharmony_ci * in_c vector. 986cabdff1aSopenharmony_ci * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) 987cabdff1aSopenharmony_ci * ============================================================================= 988cabdff1aSopenharmony_ci */ 989cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, 990cabdff1aSopenharmony_ci __m256i in_l) { 991cabdff1aSopenharmony_ci __m256i out; 992cabdff1aSopenharmony_ci 993cabdff1aSopenharmony_ci out = __lasx_xvmulwev_h_bu(in_h, in_l); 994cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); 995cabdff1aSopenharmony_ci out = __lasx_xvsub_h(in_c, out); 996cabdff1aSopenharmony_ci return out; 997cabdff1aSopenharmony_ci} 998cabdff1aSopenharmony_ci 999cabdff1aSopenharmony_ci/* 1000cabdff1aSopenharmony_ci * ============================================================================= 1001cabdff1aSopenharmony_ci * Description : Vector Signed Dot Product and Subtract 1002cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 1003cabdff1aSopenharmony_ci * Output - out 1004cabdff1aSopenharmony_ci * Return Type - signed word 1005cabdff1aSopenharmony_ci * Details : Signed halfword elements from in_h are multiplied with 1006cabdff1aSopenharmony_ci * Signed halfword elements from in_l producing a result 1007cabdff1aSopenharmony_ci * twice the size of input i.e. signed word. 1008cabdff1aSopenharmony_ci * Multiplication result of adjacent odd-even elements 1009cabdff1aSopenharmony_ci * are added together and subtracted from double width elements 1010cabdff1aSopenharmony_ci * in_c vector. 1011cabdff1aSopenharmony_ci * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) 1012cabdff1aSopenharmony_ci * in_c : 0,0,0,0, 0,0,0,0 1013cabdff1aSopenharmony_ci * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 1014cabdff1aSopenharmony_ci * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 1015cabdff1aSopenharmony_ci * out : -7,-3,0,0, 0,-1,0,-1 1016cabdff1aSopenharmony_ci * ============================================================================= 1017cabdff1aSopenharmony_ci */ 1018cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, 1019cabdff1aSopenharmony_ci __m256i in_l) { 1020cabdff1aSopenharmony_ci __m256i out; 1021cabdff1aSopenharmony_ci 1022cabdff1aSopenharmony_ci out = __lasx_xvmulwev_w_h(in_h, in_l); 1023cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 1024cabdff1aSopenharmony_ci out = __lasx_xvsub_w(in_c, out); 1025cabdff1aSopenharmony_ci return out; 1026cabdff1aSopenharmony_ci} 1027cabdff1aSopenharmony_ci 1028cabdff1aSopenharmony_ci/* 1029cabdff1aSopenharmony_ci * ============================================================================= 1030cabdff1aSopenharmony_ci * Description : Dot product of halfword vector elements 1031cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1032cabdff1aSopenharmony_ci * Output - out 1033cabdff1aSopenharmony_ci * Return Type - signed word 1034cabdff1aSopenharmony_ci * Details : Signed halfword elements from in_h are multiplied with 1035cabdff1aSopenharmony_ci * signed halfword elements from in_l producing a result 1036cabdff1aSopenharmony_ci * four times the size of input i.e. signed doubleword. 1037cabdff1aSopenharmony_ci * Then this multiplication results of four adjacent elements 1038cabdff1aSopenharmony_ci * are added together and stored to the out vector. 1039cabdff1aSopenharmony_ci * Example : out = __lasx_xvdp4_d_h(in_h, in_l) 1040cabdff1aSopenharmony_ci * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 1041cabdff1aSopenharmony_ci * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 1042cabdff1aSopenharmony_ci * out : -2,0,1,1 1043cabdff1aSopenharmony_ci * ============================================================================= 1044cabdff1aSopenharmony_ci */ 1045cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { 1046cabdff1aSopenharmony_ci __m256i out; 1047cabdff1aSopenharmony_ci 1048cabdff1aSopenharmony_ci out = __lasx_xvmulwev_w_h(in_h, in_l); 1049cabdff1aSopenharmony_ci out = __lasx_xvmaddwod_w_h(out, in_h, in_l); 1050cabdff1aSopenharmony_ci out = __lasx_xvhaddw_d_w(out, out); 1051cabdff1aSopenharmony_ci return out; 1052cabdff1aSopenharmony_ci} 1053cabdff1aSopenharmony_ci 1054cabdff1aSopenharmony_ci/* 1055cabdff1aSopenharmony_ci * ============================================================================= 1056cabdff1aSopenharmony_ci * Description : The high half of the vector elements are expanded and 1057cabdff1aSopenharmony_ci * added after being doubled. 1058cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1059cabdff1aSopenharmony_ci * Output - out 1060cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are added after the 1061cabdff1aSopenharmony_ci * higher half of the two-fold sign extension (signed byte 1062cabdff1aSopenharmony_ci * to signed halfword) and stored to the out vector. 1063cabdff1aSopenharmony_ci * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l) 1064cabdff1aSopenharmony_ci * ============================================================================= 1065cabdff1aSopenharmony_ci */ 1066cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) { 1067cabdff1aSopenharmony_ci __m256i out; 1068cabdff1aSopenharmony_ci 1069cabdff1aSopenharmony_ci out = __lasx_xvilvh_b(in_h, in_l); 1070cabdff1aSopenharmony_ci out = __lasx_xvhaddw_h_b(out, out); 1071cabdff1aSopenharmony_ci return out; 1072cabdff1aSopenharmony_ci} 1073cabdff1aSopenharmony_ci 1074cabdff1aSopenharmony_ci/* 1075cabdff1aSopenharmony_ci * ============================================================================= 1076cabdff1aSopenharmony_ci * Description : The high half of the vector elements are expanded and 1077cabdff1aSopenharmony_ci * added after being doubled. 1078cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1079cabdff1aSopenharmony_ci * Output - out 1080cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are added after the 1081cabdff1aSopenharmony_ci * higher half of the two-fold sign extension (signed halfword 1082cabdff1aSopenharmony_ci * to signed word) and stored to the out vector. 1083cabdff1aSopenharmony_ci * Example : out = __lasx_xvaddwh_w_h(in_h, in_l) 1084cabdff1aSopenharmony_ci * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1085cabdff1aSopenharmony_ci * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 1086cabdff1aSopenharmony_ci * out : 1,0,0,-1, 1,0,0, 2 1087cabdff1aSopenharmony_ci * ============================================================================= 1088cabdff1aSopenharmony_ci */ 1089cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) { 1090cabdff1aSopenharmony_ci __m256i out; 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci out = __lasx_xvilvh_h(in_h, in_l); 1093cabdff1aSopenharmony_ci out = __lasx_xvhaddw_w_h(out, out); 1094cabdff1aSopenharmony_ci return out; 1095cabdff1aSopenharmony_ci} 1096cabdff1aSopenharmony_ci 1097cabdff1aSopenharmony_ci/* 1098cabdff1aSopenharmony_ci * ============================================================================= 1099cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and 1100cabdff1aSopenharmony_ci * added after being doubled. 1101cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1102cabdff1aSopenharmony_ci * Output - out 1103cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are added after the 1104cabdff1aSopenharmony_ci * lower half of the two-fold sign extension (signed byte 1105cabdff1aSopenharmony_ci * to signed halfword) and stored to the out vector. 1106cabdff1aSopenharmony_ci * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) 1107cabdff1aSopenharmony_ci * ============================================================================= 1108cabdff1aSopenharmony_ci */ 1109cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) { 1110cabdff1aSopenharmony_ci __m256i out; 1111cabdff1aSopenharmony_ci 1112cabdff1aSopenharmony_ci out = __lasx_xvilvl_b(in_h, in_l); 1113cabdff1aSopenharmony_ci out = __lasx_xvhaddw_h_b(out, out); 1114cabdff1aSopenharmony_ci return out; 1115cabdff1aSopenharmony_ci} 1116cabdff1aSopenharmony_ci 1117cabdff1aSopenharmony_ci/* 1118cabdff1aSopenharmony_ci * ============================================================================= 1119cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and 1120cabdff1aSopenharmony_ci * added after being doubled. 1121cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1122cabdff1aSopenharmony_ci * Output - out 1123cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are added after the 1124cabdff1aSopenharmony_ci * lower half of the two-fold sign extension (signed halfword 1125cabdff1aSopenharmony_ci * to signed word) and stored to the out vector. 1126cabdff1aSopenharmony_ci * Example : out = __lasx_xvaddwl_w_h(in_h, in_l) 1127cabdff1aSopenharmony_ci * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1128cabdff1aSopenharmony_ci * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 1129cabdff1aSopenharmony_ci * out : 5,-1,4,2, 1,0,2,-1 1130cabdff1aSopenharmony_ci * ============================================================================= 1131cabdff1aSopenharmony_ci */ 1132cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) { 1133cabdff1aSopenharmony_ci __m256i out; 1134cabdff1aSopenharmony_ci 1135cabdff1aSopenharmony_ci out = __lasx_xvilvl_h(in_h, in_l); 1136cabdff1aSopenharmony_ci out = __lasx_xvhaddw_w_h(out, out); 1137cabdff1aSopenharmony_ci return out; 1138cabdff1aSopenharmony_ci} 1139cabdff1aSopenharmony_ci 1140cabdff1aSopenharmony_ci/* 1141cabdff1aSopenharmony_ci * ============================================================================= 1142cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and 1143cabdff1aSopenharmony_ci * added after being doubled. 1144cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1145cabdff1aSopenharmony_ci * Output - out 1146cabdff1aSopenharmony_ci * Details : The out vector and the out vector are added after the 1147cabdff1aSopenharmony_ci * lower half of the two-fold zero extension (unsigned byte 1148cabdff1aSopenharmony_ci * to unsigned halfword) and stored to the out vector. 1149cabdff1aSopenharmony_ci * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) 1150cabdff1aSopenharmony_ci * ============================================================================= 1151cabdff1aSopenharmony_ci */ 1152cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { 1153cabdff1aSopenharmony_ci __m256i out; 1154cabdff1aSopenharmony_ci 1155cabdff1aSopenharmony_ci out = __lasx_xvilvl_b(in_h, in_l); 1156cabdff1aSopenharmony_ci out = __lasx_xvhaddw_hu_bu(out, out); 1157cabdff1aSopenharmony_ci return out; 1158cabdff1aSopenharmony_ci} 1159cabdff1aSopenharmony_ci 1160cabdff1aSopenharmony_ci/* 1161cabdff1aSopenharmony_ci * ============================================================================= 1162cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and 1163cabdff1aSopenharmony_ci * added after being doubled. 1164cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1165cabdff1aSopenharmony_ci * Output - out 1166cabdff1aSopenharmony_ci * Details : The in_l vector after double zero extension (unsigned byte to 1167cabdff1aSopenharmony_ci * signed halfword),added to the in_h vector. 1168cabdff1aSopenharmony_ci * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) 1169cabdff1aSopenharmony_ci * ============================================================================= 1170cabdff1aSopenharmony_ci */ 1171cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { 1172cabdff1aSopenharmony_ci __m256i out; 1173cabdff1aSopenharmony_ci 1174cabdff1aSopenharmony_ci out = __lasx_xvsllwil_hu_bu(in_l, 0); 1175cabdff1aSopenharmony_ci out = __lasx_xvadd_h(in_h, out); 1176cabdff1aSopenharmony_ci return out; 1177cabdff1aSopenharmony_ci} 1178cabdff1aSopenharmony_ci 1179cabdff1aSopenharmony_ci/* 1180cabdff1aSopenharmony_ci * ============================================================================= 1181cabdff1aSopenharmony_ci * Description : The low half of the vector elements are expanded and 1182cabdff1aSopenharmony_ci * added after being doubled. 1183cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1184cabdff1aSopenharmony_ci * Output - out 1185cabdff1aSopenharmony_ci * Details : The in_l vector after double sign extension (signed halfword to 1186cabdff1aSopenharmony_ci * signed word), added to the in_h vector. 1187cabdff1aSopenharmony_ci * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) 1188cabdff1aSopenharmony_ci * in_h : 0, 1,0,0, -1,0,0,1, 1189cabdff1aSopenharmony_ci * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, 1190cabdff1aSopenharmony_ci * out : 2, 0,1,2, -1,0,1,1, 1191cabdff1aSopenharmony_ci * ============================================================================= 1192cabdff1aSopenharmony_ci */ 1193cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { 1194cabdff1aSopenharmony_ci __m256i out; 1195cabdff1aSopenharmony_ci 1196cabdff1aSopenharmony_ci out = __lasx_xvsllwil_w_h(in_l, 0); 1197cabdff1aSopenharmony_ci out = __lasx_xvadd_w(in_h, out); 1198cabdff1aSopenharmony_ci return out; 1199cabdff1aSopenharmony_ci} 1200cabdff1aSopenharmony_ci 1201cabdff1aSopenharmony_ci/* 1202cabdff1aSopenharmony_ci * ============================================================================= 1203cabdff1aSopenharmony_ci * Description : Multiplication and addition calculation after expansion 1204cabdff1aSopenharmony_ci * of the lower half of the vector. 1205cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 1206cabdff1aSopenharmony_ci * Output - out 1207cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are multiplied after 1208cabdff1aSopenharmony_ci * the lower half of the two-fold sign extension (signed halfword 1209cabdff1aSopenharmony_ci * to signed word), and the result is added to the vector in_c, 1210cabdff1aSopenharmony_ci * then stored to the out vector. 1211cabdff1aSopenharmony_ci * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) 1212cabdff1aSopenharmony_ci * in_c : 1,2,3,4, 5,6,7,8 1213cabdff1aSopenharmony_ci * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 1214cabdff1aSopenharmony_ci * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, 1215cabdff1aSopenharmony_ci * -200,-300,-400,-500, -2000,-3000,-4000,-5000 1216cabdff1aSopenharmony_ci * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 1217cabdff1aSopenharmony_ci * ============================================================================= 1218cabdff1aSopenharmony_ci */ 1219cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, 1220cabdff1aSopenharmony_ci __m256i in_l) { 1221cabdff1aSopenharmony_ci __m256i tmp0, tmp1, out; 1222cabdff1aSopenharmony_ci 1223cabdff1aSopenharmony_ci tmp0 = __lasx_xvsllwil_w_h(in_h, 0); 1224cabdff1aSopenharmony_ci tmp1 = __lasx_xvsllwil_w_h(in_l, 0); 1225cabdff1aSopenharmony_ci tmp0 = __lasx_xvmul_w(tmp0, tmp1); 1226cabdff1aSopenharmony_ci out = __lasx_xvadd_w(tmp0, in_c); 1227cabdff1aSopenharmony_ci return out; 1228cabdff1aSopenharmony_ci} 1229cabdff1aSopenharmony_ci 1230cabdff1aSopenharmony_ci/* 1231cabdff1aSopenharmony_ci * ============================================================================= 1232cabdff1aSopenharmony_ci * Description : Multiplication and addition calculation after expansion 1233cabdff1aSopenharmony_ci * of the higher half of the vector. 1234cabdff1aSopenharmony_ci * Arguments : Inputs - in_c, in_h, in_l 1235cabdff1aSopenharmony_ci * Output - out 1236cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are multiplied after 1237cabdff1aSopenharmony_ci * the higher half of the two-fold sign extension (signed 1238cabdff1aSopenharmony_ci * halfword to signed word), and the result is added to 1239cabdff1aSopenharmony_ci * the vector in_c, then stored to the out vector. 1240cabdff1aSopenharmony_ci * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) 1241cabdff1aSopenharmony_ci * ============================================================================= 1242cabdff1aSopenharmony_ci */ 1243cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, 1244cabdff1aSopenharmony_ci __m256i in_l) { 1245cabdff1aSopenharmony_ci __m256i tmp0, tmp1, out; 1246cabdff1aSopenharmony_ci 1247cabdff1aSopenharmony_ci tmp0 = __lasx_xvilvh_h(in_h, in_h); 1248cabdff1aSopenharmony_ci tmp1 = __lasx_xvilvh_h(in_l, in_l); 1249cabdff1aSopenharmony_ci tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); 1250cabdff1aSopenharmony_ci out = __lasx_xvadd_w(tmp0, in_c); 1251cabdff1aSopenharmony_ci return out; 1252cabdff1aSopenharmony_ci} 1253cabdff1aSopenharmony_ci 1254cabdff1aSopenharmony_ci/* 1255cabdff1aSopenharmony_ci * ============================================================================= 1256cabdff1aSopenharmony_ci * Description : Multiplication calculation after expansion of the lower 1257cabdff1aSopenharmony_ci * half of the vector. 1258cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1259cabdff1aSopenharmony_ci * Output - out 1260cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are multiplied after 1261cabdff1aSopenharmony_ci * the lower half of the two-fold sign extension (signed 1262cabdff1aSopenharmony_ci * halfword to signed word), then stored to the out vector. 1263cabdff1aSopenharmony_ci * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) 1264cabdff1aSopenharmony_ci * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1265cabdff1aSopenharmony_ci * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 1266cabdff1aSopenharmony_ci * out : 6,1,3,0, 0,0,1,0 1267cabdff1aSopenharmony_ci * ============================================================================= 1268cabdff1aSopenharmony_ci */ 1269cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { 1270cabdff1aSopenharmony_ci __m256i tmp0, tmp1, out; 1271cabdff1aSopenharmony_ci 1272cabdff1aSopenharmony_ci tmp0 = __lasx_xvsllwil_w_h(in_h, 0); 1273cabdff1aSopenharmony_ci tmp1 = __lasx_xvsllwil_w_h(in_l, 0); 1274cabdff1aSopenharmony_ci out = __lasx_xvmul_w(tmp0, tmp1); 1275cabdff1aSopenharmony_ci return out; 1276cabdff1aSopenharmony_ci} 1277cabdff1aSopenharmony_ci 1278cabdff1aSopenharmony_ci/* 1279cabdff1aSopenharmony_ci * ============================================================================= 1280cabdff1aSopenharmony_ci * Description : Multiplication calculation after expansion of the lower 1281cabdff1aSopenharmony_ci * half of the vector. 1282cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1283cabdff1aSopenharmony_ci * Output - out 1284cabdff1aSopenharmony_ci * Details : The in_h vector and the in_l vector are multiplied after 1285cabdff1aSopenharmony_ci * the lower half of the two-fold sign extension (signed 1286cabdff1aSopenharmony_ci * halfword to signed word), then stored to the out vector. 1287cabdff1aSopenharmony_ci * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) 1288cabdff1aSopenharmony_ci * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 1289cabdff1aSopenharmony_ci * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 1290cabdff1aSopenharmony_ci * out : 0,0,0,0, 0,0,0,1 1291cabdff1aSopenharmony_ci * ============================================================================= 1292cabdff1aSopenharmony_ci */ 1293cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { 1294cabdff1aSopenharmony_ci __m256i tmp0, tmp1, out; 1295cabdff1aSopenharmony_ci 1296cabdff1aSopenharmony_ci tmp0 = __lasx_xvilvh_h(in_h, in_h); 1297cabdff1aSopenharmony_ci tmp1 = __lasx_xvilvh_h(in_l, in_l); 1298cabdff1aSopenharmony_ci out = __lasx_xvmulwev_w_h(tmp0, tmp1); 1299cabdff1aSopenharmony_ci return out; 1300cabdff1aSopenharmony_ci} 1301cabdff1aSopenharmony_ci 1302cabdff1aSopenharmony_ci/* 1303cabdff1aSopenharmony_ci * ============================================================================= 1304cabdff1aSopenharmony_ci * Description : The low half of the vector elements are added to the high half 1305cabdff1aSopenharmony_ci * after being doubled, then saturated. 1306cabdff1aSopenharmony_ci * Arguments : Inputs - in_h, in_l 1307cabdff1aSopenharmony_ci * Output - out 1308cabdff1aSopenharmony_ci * Details : The in_h vector adds the in_l vector after the lower half of 1309cabdff1aSopenharmony_ci * the two-fold zero extension (unsigned byte to unsigned 1310cabdff1aSopenharmony_ci * halfword) and then saturated. The results are stored to the out 1311cabdff1aSopenharmony_ci * vector. 1312cabdff1aSopenharmony_ci * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) 1313cabdff1aSopenharmony_ci * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 1314cabdff1aSopenharmony_ci * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 1315cabdff1aSopenharmony_ci * 0,0,0,1 1316cabdff1aSopenharmony_ci * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, 1317cabdff1aSopenharmony_ci * ============================================================================= 1318cabdff1aSopenharmony_ci */ 1319cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { 1320cabdff1aSopenharmony_ci __m256i tmp1, out; 1321cabdff1aSopenharmony_ci __m256i zero = { 0 }; 1322cabdff1aSopenharmony_ci 1323cabdff1aSopenharmony_ci tmp1 = __lasx_xvilvl_b(zero, in_l); 1324cabdff1aSopenharmony_ci out = __lasx_xvsadd_hu(in_h, tmp1); 1325cabdff1aSopenharmony_ci return out; 1326cabdff1aSopenharmony_ci} 1327cabdff1aSopenharmony_ci 1328cabdff1aSopenharmony_ci/* 1329cabdff1aSopenharmony_ci * ============================================================================= 1330cabdff1aSopenharmony_ci * Description : Clip all halfword elements of input vector between min & max 1331cabdff1aSopenharmony_ci * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) 1332cabdff1aSopenharmony_ci * Arguments : Inputs - in (input vector) 1333cabdff1aSopenharmony_ci * - min (min threshold) 1334cabdff1aSopenharmony_ci * - max (max threshold) 1335cabdff1aSopenharmony_ci * Outputs - in (output vector with clipped elements) 1336cabdff1aSopenharmony_ci * Return Type - signed halfword 1337cabdff1aSopenharmony_ci * Example : out = __lasx_xvclip_h(in, min, max) 1338cabdff1aSopenharmony_ci * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 1339cabdff1aSopenharmony_ci * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 1340cabdff1aSopenharmony_ci * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 1341cabdff1aSopenharmony_ci * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 1342cabdff1aSopenharmony_ci * ============================================================================= 1343cabdff1aSopenharmony_ci */ 1344cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { 1345cabdff1aSopenharmony_ci __m256i out; 1346cabdff1aSopenharmony_ci 1347cabdff1aSopenharmony_ci out = __lasx_xvmax_h(min, in); 1348cabdff1aSopenharmony_ci out = __lasx_xvmin_h(max, out); 1349cabdff1aSopenharmony_ci return out; 1350cabdff1aSopenharmony_ci} 1351cabdff1aSopenharmony_ci 1352cabdff1aSopenharmony_ci/* 1353cabdff1aSopenharmony_ci * ============================================================================= 1354cabdff1aSopenharmony_ci * Description : Clip all signed halfword elements of input vector 1355cabdff1aSopenharmony_ci * between 0 & 255 1356cabdff1aSopenharmony_ci * Arguments : Inputs - in (input vector) 1357cabdff1aSopenharmony_ci * Outputs - out (output vector with clipped elements) 1358cabdff1aSopenharmony_ci * Return Type - signed halfword 1359cabdff1aSopenharmony_ci * Example : See out = __lasx_xvclip255_w(in) 1360cabdff1aSopenharmony_ci * ============================================================================= 1361cabdff1aSopenharmony_ci */ 1362cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvclip255_h(__m256i in) { 1363cabdff1aSopenharmony_ci __m256i out; 1364cabdff1aSopenharmony_ci 1365cabdff1aSopenharmony_ci out = __lasx_xvmaxi_h(in, 0); 1366cabdff1aSopenharmony_ci out = __lasx_xvsat_hu(out, 7); 1367cabdff1aSopenharmony_ci return out; 1368cabdff1aSopenharmony_ci} 1369cabdff1aSopenharmony_ci 1370cabdff1aSopenharmony_ci/* 1371cabdff1aSopenharmony_ci * ============================================================================= 1372cabdff1aSopenharmony_ci * Description : Clip all signed word elements of input vector 1373cabdff1aSopenharmony_ci * between 0 & 255 1374cabdff1aSopenharmony_ci * Arguments : Inputs - in (input vector) 1375cabdff1aSopenharmony_ci * Output - out (output vector with clipped elements) 1376cabdff1aSopenharmony_ci * Return Type - signed word 1377cabdff1aSopenharmony_ci * Example : out = __lasx_xvclip255_w(in) 1378cabdff1aSopenharmony_ci * in : -8,255,280,249, -8,255,280,249 1379cabdff1aSopenharmony_ci * out : 0,255,255,249, 0,255,255,249 1380cabdff1aSopenharmony_ci * ============================================================================= 1381cabdff1aSopenharmony_ci */ 1382cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvclip255_w(__m256i in) { 1383cabdff1aSopenharmony_ci __m256i out; 1384cabdff1aSopenharmony_ci 1385cabdff1aSopenharmony_ci out = __lasx_xvmaxi_w(in, 0); 1386cabdff1aSopenharmony_ci out = __lasx_xvsat_wu(out, 7); 1387cabdff1aSopenharmony_ci return out; 1388cabdff1aSopenharmony_ci} 1389cabdff1aSopenharmony_ci 1390cabdff1aSopenharmony_ci/* 1391cabdff1aSopenharmony_ci * ============================================================================= 1392cabdff1aSopenharmony_ci * Description : Indexed halfword element values are replicated to all 1393cabdff1aSopenharmony_ci * elements in output vector. If 'idx < 8' use xvsplati_l_*, 1394cabdff1aSopenharmony_ci * if 'idx >= 8' use xvsplati_h_*. 1395cabdff1aSopenharmony_ci * Arguments : Inputs - in, idx 1396cabdff1aSopenharmony_ci * Output - out 1397cabdff1aSopenharmony_ci * Details : Idx element value from in vector is replicated to all 1398cabdff1aSopenharmony_ci * elements in out vector. 1399cabdff1aSopenharmony_ci * Valid index range for halfword operation is 0-7 1400cabdff1aSopenharmony_ci * Example : out = __lasx_xvsplati_l_h(in, idx) 1401cabdff1aSopenharmony_ci * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 1402cabdff1aSopenharmony_ci * idx : 0x02 1403cabdff1aSopenharmony_ci * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 1404cabdff1aSopenharmony_ci * ============================================================================= 1405cabdff1aSopenharmony_ci */ 1406cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { 1407cabdff1aSopenharmony_ci __m256i out; 1408cabdff1aSopenharmony_ci 1409cabdff1aSopenharmony_ci out = __lasx_xvpermi_q(in, in, 0x02); 1410cabdff1aSopenharmony_ci out = __lasx_xvreplve_h(out, idx); 1411cabdff1aSopenharmony_ci return out; 1412cabdff1aSopenharmony_ci} 1413cabdff1aSopenharmony_ci 1414cabdff1aSopenharmony_ci/* 1415cabdff1aSopenharmony_ci * ============================================================================= 1416cabdff1aSopenharmony_ci * Description : Indexed halfword element values are replicated to all 1417cabdff1aSopenharmony_ci * elements in output vector. If 'idx < 8' use xvsplati_l_*, 1418cabdff1aSopenharmony_ci * if 'idx >= 8' use xvsplati_h_*. 1419cabdff1aSopenharmony_ci * Arguments : Inputs - in, idx 1420cabdff1aSopenharmony_ci * Output - out 1421cabdff1aSopenharmony_ci * Details : Idx element value from in vector is replicated to all 1422cabdff1aSopenharmony_ci * elements in out vector. 1423cabdff1aSopenharmony_ci * Valid index range for halfword operation is 0-7 1424cabdff1aSopenharmony_ci * Example : out = __lasx_xvsplati_h_h(in, idx) 1425cabdff1aSopenharmony_ci * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 1426cabdff1aSopenharmony_ci * idx : 0x09 1427cabdff1aSopenharmony_ci * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 1428cabdff1aSopenharmony_ci * ============================================================================= 1429cabdff1aSopenharmony_ci */ 1430cabdff1aSopenharmony_cistatic inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { 1431cabdff1aSopenharmony_ci __m256i out; 1432cabdff1aSopenharmony_ci 1433cabdff1aSopenharmony_ci out = __lasx_xvpermi_q(in, in, 0x13); 1434cabdff1aSopenharmony_ci out = __lasx_xvreplve_h(out, idx); 1435cabdff1aSopenharmony_ci return out; 1436cabdff1aSopenharmony_ci} 1437cabdff1aSopenharmony_ci 1438cabdff1aSopenharmony_ci/* 1439cabdff1aSopenharmony_ci * ============================================================================= 1440cabdff1aSopenharmony_ci * Description : Transpose 4x4 block with double-word elements in vectors 1441cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3 1442cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3 1443cabdff1aSopenharmony_ci * Example : LASX_TRANSPOSE4x4_D 1444cabdff1aSopenharmony_ci * _in0 : 1,2,3,4 1445cabdff1aSopenharmony_ci * _in1 : 1,2,3,4 1446cabdff1aSopenharmony_ci * _in2 : 1,2,3,4 1447cabdff1aSopenharmony_ci * _in3 : 1,2,3,4 1448cabdff1aSopenharmony_ci * 1449cabdff1aSopenharmony_ci * _out0 : 1,1,1,1 1450cabdff1aSopenharmony_ci * _out1 : 2,2,2,2 1451cabdff1aSopenharmony_ci * _out2 : 3,3,3,3 1452cabdff1aSopenharmony_ci * _out3 : 4,4,4,4 1453cabdff1aSopenharmony_ci * ============================================================================= 1454cabdff1aSopenharmony_ci */ 1455cabdff1aSopenharmony_ci#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ 1456cabdff1aSopenharmony_ci _out3) \ 1457cabdff1aSopenharmony_ci { \ 1458cabdff1aSopenharmony_ci __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ 1459cabdff1aSopenharmony_ci _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ 1460cabdff1aSopenharmony_ci _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ 1461cabdff1aSopenharmony_ci _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ 1462cabdff1aSopenharmony_ci _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ 1463cabdff1aSopenharmony_ci _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ 1464cabdff1aSopenharmony_ci _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ 1465cabdff1aSopenharmony_ci _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ 1466cabdff1aSopenharmony_ci _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ 1467cabdff1aSopenharmony_ci } 1468cabdff1aSopenharmony_ci 1469cabdff1aSopenharmony_ci/* 1470cabdff1aSopenharmony_ci * ============================================================================= 1471cabdff1aSopenharmony_ci * Description : Transpose 8x8 block with word elements in vectors 1472cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 1473cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1474cabdff1aSopenharmony_ci * _out7 1475cabdff1aSopenharmony_ci * Example : LASX_TRANSPOSE8x8_W 1476cabdff1aSopenharmony_ci * _in0 : 1,2,3,4,5,6,7,8 1477cabdff1aSopenharmony_ci * _in1 : 2,2,3,4,5,6,7,8 1478cabdff1aSopenharmony_ci * _in2 : 3,2,3,4,5,6,7,8 1479cabdff1aSopenharmony_ci * _in3 : 4,2,3,4,5,6,7,8 1480cabdff1aSopenharmony_ci * _in4 : 5,2,3,4,5,6,7,8 1481cabdff1aSopenharmony_ci * _in5 : 6,2,3,4,5,6,7,8 1482cabdff1aSopenharmony_ci * _in6 : 7,2,3,4,5,6,7,8 1483cabdff1aSopenharmony_ci * _in7 : 8,2,3,4,5,6,7,8 1484cabdff1aSopenharmony_ci * 1485cabdff1aSopenharmony_ci * _out0 : 1,2,3,4,5,6,7,8 1486cabdff1aSopenharmony_ci * _out1 : 2,2,2,2,2,2,2,2 1487cabdff1aSopenharmony_ci * _out2 : 3,3,3,3,3,3,3,3 1488cabdff1aSopenharmony_ci * _out3 : 4,4,4,4,4,4,4,4 1489cabdff1aSopenharmony_ci * _out4 : 5,5,5,5,5,5,5,5 1490cabdff1aSopenharmony_ci * _out5 : 6,6,6,6,6,6,6,6 1491cabdff1aSopenharmony_ci * _out6 : 7,7,7,7,7,7,7,7 1492cabdff1aSopenharmony_ci * _out7 : 8,8,8,8,8,8,8,8 1493cabdff1aSopenharmony_ci * ============================================================================= 1494cabdff1aSopenharmony_ci */ 1495cabdff1aSopenharmony_ci#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1496cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1497cabdff1aSopenharmony_ci _out7) \ 1498cabdff1aSopenharmony_ci { \ 1499cabdff1aSopenharmony_ci __m256i _s0_m, _s1_m; \ 1500cabdff1aSopenharmony_ci __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1501cabdff1aSopenharmony_ci __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1502cabdff1aSopenharmony_ci \ 1503cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvl_w(_in2, _in0); \ 1504cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvl_w(_in3, _in1); \ 1505cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1506cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1507cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvh_w(_in2, _in0); \ 1508cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvh_w(_in3, _in1); \ 1509cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1510cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1511cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvl_w(_in6, _in4); \ 1512cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvl_w(_in7, _in5); \ 1513cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1514cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1515cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvh_w(_in6, _in4); \ 1516cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvh_w(_in7, _in5); \ 1517cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1518cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1519cabdff1aSopenharmony_ci _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ 1520cabdff1aSopenharmony_ci _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ 1521cabdff1aSopenharmony_ci _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ 1522cabdff1aSopenharmony_ci _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \ 1523cabdff1aSopenharmony_ci _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ 1524cabdff1aSopenharmony_ci _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ 1525cabdff1aSopenharmony_ci _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ 1526cabdff1aSopenharmony_ci _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ 1527cabdff1aSopenharmony_ci } 1528cabdff1aSopenharmony_ci 1529cabdff1aSopenharmony_ci/* 1530cabdff1aSopenharmony_ci * ============================================================================= 1531cabdff1aSopenharmony_ci * Description : Transpose input 16x8 byte block 1532cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, 1533cabdff1aSopenharmony_ci * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 1534cabdff1aSopenharmony_ci * (input 16x8 byte block) 1535cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1536cabdff1aSopenharmony_ci * _out7 (output 8x16 byte block) 1537cabdff1aSopenharmony_ci * Details : The rows of the matrix become columns, and the columns become 1538cabdff1aSopenharmony_ci * rows. 1539cabdff1aSopenharmony_ci * Example : See LASX_TRANSPOSE16x8_H 1540cabdff1aSopenharmony_ci * ============================================================================= 1541cabdff1aSopenharmony_ci */ 1542cabdff1aSopenharmony_ci#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1543cabdff1aSopenharmony_ci _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ 1544cabdff1aSopenharmony_ci _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ 1545cabdff1aSopenharmony_ci _out6, _out7) \ 1546cabdff1aSopenharmony_ci { \ 1547cabdff1aSopenharmony_ci __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1548cabdff1aSopenharmony_ci __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1549cabdff1aSopenharmony_ci \ 1550cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ 1551cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ 1552cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ 1553cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ 1554cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ 1555cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ 1556cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ 1557cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ 1558cabdff1aSopenharmony_ci _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ 1559cabdff1aSopenharmony_ci _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ 1560cabdff1aSopenharmony_ci _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ 1561cabdff1aSopenharmony_ci _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ 1562cabdff1aSopenharmony_ci _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ 1563cabdff1aSopenharmony_ci _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ 1564cabdff1aSopenharmony_ci _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ 1565cabdff1aSopenharmony_ci _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ 1566cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ 1567cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ 1568cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ 1569cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ 1570cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ 1571cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ 1572cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ 1573cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ 1574cabdff1aSopenharmony_ci _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ 1575cabdff1aSopenharmony_ci _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ 1576cabdff1aSopenharmony_ci _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ 1577cabdff1aSopenharmony_ci _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ 1578cabdff1aSopenharmony_ci _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ 1579cabdff1aSopenharmony_ci _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ 1580cabdff1aSopenharmony_ci _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ 1581cabdff1aSopenharmony_ci _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ 1582cabdff1aSopenharmony_ci } 1583cabdff1aSopenharmony_ci 1584cabdff1aSopenharmony_ci/* 1585cabdff1aSopenharmony_ci * ============================================================================= 1586cabdff1aSopenharmony_ci * Description : Transpose input 16x8 byte block 1587cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, 1588cabdff1aSopenharmony_ci * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 1589cabdff1aSopenharmony_ci * (input 16x8 byte block) 1590cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1591cabdff1aSopenharmony_ci * _out7 (output 8x16 byte block) 1592cabdff1aSopenharmony_ci * Details : The rows of the matrix become columns, and the columns become 1593cabdff1aSopenharmony_ci * rows. 1594cabdff1aSopenharmony_ci * Example : LASX_TRANSPOSE16x8_H 1595cabdff1aSopenharmony_ci * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1596cabdff1aSopenharmony_ci * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1597cabdff1aSopenharmony_ci * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1598cabdff1aSopenharmony_ci * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1599cabdff1aSopenharmony_ci * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1600cabdff1aSopenharmony_ci * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1601cabdff1aSopenharmony_ci * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1602cabdff1aSopenharmony_ci * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1603cabdff1aSopenharmony_ci * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1604cabdff1aSopenharmony_ci * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1605cabdff1aSopenharmony_ci * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1606cabdff1aSopenharmony_ci * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1607cabdff1aSopenharmony_ci * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1608cabdff1aSopenharmony_ci * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1609cabdff1aSopenharmony_ci * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1610cabdff1aSopenharmony_ci * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 1611cabdff1aSopenharmony_ci * 1612cabdff1aSopenharmony_ci * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 1613cabdff1aSopenharmony_ci * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 1614cabdff1aSopenharmony_ci * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 1615cabdff1aSopenharmony_ci * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 1616cabdff1aSopenharmony_ci * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 1617cabdff1aSopenharmony_ci * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 1618cabdff1aSopenharmony_ci * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 1619cabdff1aSopenharmony_ci * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 1620cabdff1aSopenharmony_ci * ============================================================================= 1621cabdff1aSopenharmony_ci */ 1622cabdff1aSopenharmony_ci#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1623cabdff1aSopenharmony_ci _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ 1624cabdff1aSopenharmony_ci _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ 1625cabdff1aSopenharmony_ci _out6, _out7) \ 1626cabdff1aSopenharmony_ci { \ 1627cabdff1aSopenharmony_ci __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1628cabdff1aSopenharmony_ci __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1629cabdff1aSopenharmony_ci __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ 1630cabdff1aSopenharmony_ci \ 1631cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ 1632cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ 1633cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ 1634cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ 1635cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ 1636cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ 1637cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ 1638cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ 1639cabdff1aSopenharmony_ci _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ 1640cabdff1aSopenharmony_ci _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ 1641cabdff1aSopenharmony_ci _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ 1642cabdff1aSopenharmony_ci _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ 1643cabdff1aSopenharmony_ci _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ 1644cabdff1aSopenharmony_ci _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ 1645cabdff1aSopenharmony_ci _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ 1646cabdff1aSopenharmony_ci _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ 1647cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ 1648cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ 1649cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ 1650cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ 1651cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ 1652cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ 1653cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ 1654cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ 1655cabdff1aSopenharmony_ci _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ 1656cabdff1aSopenharmony_ci _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ 1657cabdff1aSopenharmony_ci _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ 1658cabdff1aSopenharmony_ci _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ 1659cabdff1aSopenharmony_ci \ 1660cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ 1661cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ 1662cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ 1663cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ 1664cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ 1665cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ 1666cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ 1667cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ 1668cabdff1aSopenharmony_ci _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ 1669cabdff1aSopenharmony_ci _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ 1670cabdff1aSopenharmony_ci _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ 1671cabdff1aSopenharmony_ci _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ 1672cabdff1aSopenharmony_ci _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ 1673cabdff1aSopenharmony_ci _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ 1674cabdff1aSopenharmony_ci _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ 1675cabdff1aSopenharmony_ci _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ 1676cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ 1677cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ 1678cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ 1679cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ 1680cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ 1681cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ 1682cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ 1683cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ 1684cabdff1aSopenharmony_ci _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ 1685cabdff1aSopenharmony_ci _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ 1686cabdff1aSopenharmony_ci _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ 1687cabdff1aSopenharmony_ci _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ 1688cabdff1aSopenharmony_ci } 1689cabdff1aSopenharmony_ci 1690cabdff1aSopenharmony_ci/* 1691cabdff1aSopenharmony_ci * ============================================================================= 1692cabdff1aSopenharmony_ci * Description : Transpose 4x4 block with halfword elements in vectors 1693cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3 1694cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3 1695cabdff1aSopenharmony_ci * Return Type - signed halfword 1696cabdff1aSopenharmony_ci * Details : The rows of the matrix become columns, and the columns become 1697cabdff1aSopenharmony_ci * rows. 1698cabdff1aSopenharmony_ci * Example : See LASX_TRANSPOSE8x8_H 1699cabdff1aSopenharmony_ci * ============================================================================= 1700cabdff1aSopenharmony_ci */ 1701cabdff1aSopenharmony_ci#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ 1702cabdff1aSopenharmony_ci _out3) \ 1703cabdff1aSopenharmony_ci { \ 1704cabdff1aSopenharmony_ci __m256i _s0_m, _s1_m; \ 1705cabdff1aSopenharmony_ci \ 1706cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvl_h(_in1, _in0); \ 1707cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvl_h(_in3, _in2); \ 1708cabdff1aSopenharmony_ci _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ 1709cabdff1aSopenharmony_ci _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ 1710cabdff1aSopenharmony_ci _out1 = __lasx_xvilvh_d(_out0, _out0); \ 1711cabdff1aSopenharmony_ci _out3 = __lasx_xvilvh_d(_out2, _out2); \ 1712cabdff1aSopenharmony_ci } 1713cabdff1aSopenharmony_ci 1714cabdff1aSopenharmony_ci/* 1715cabdff1aSopenharmony_ci * ============================================================================= 1716cabdff1aSopenharmony_ci * Description : Transpose input 8x8 byte block 1717cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 1718cabdff1aSopenharmony_ci * (input 8x8 byte block) 1719cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, 1720cabdff1aSopenharmony_ci * _out7 (output 8x8 byte block) 1721cabdff1aSopenharmony_ci * Example : See LASX_TRANSPOSE8x8_H 1722cabdff1aSopenharmony_ci * ============================================================================= 1723cabdff1aSopenharmony_ci */ 1724cabdff1aSopenharmony_ci#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1725cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1726cabdff1aSopenharmony_ci _out7) \ 1727cabdff1aSopenharmony_ci { \ 1728cabdff1aSopenharmony_ci __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1729cabdff1aSopenharmony_ci __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1730cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ 1731cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ 1732cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ 1733cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ 1734cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ 1735cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ 1736cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ 1737cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ 1738cabdff1aSopenharmony_ci _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ 1739cabdff1aSopenharmony_ci _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ 1740cabdff1aSopenharmony_ci _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ 1741cabdff1aSopenharmony_ci _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ 1742cabdff1aSopenharmony_ci _out1 = __lasx_xvbsrl_v(_out0, 8); \ 1743cabdff1aSopenharmony_ci _out3 = __lasx_xvbsrl_v(_out2, 8); \ 1744cabdff1aSopenharmony_ci _out5 = __lasx_xvbsrl_v(_out4, 8); \ 1745cabdff1aSopenharmony_ci _out7 = __lasx_xvbsrl_v(_out6, 8); \ 1746cabdff1aSopenharmony_ci } 1747cabdff1aSopenharmony_ci 1748cabdff1aSopenharmony_ci/* 1749cabdff1aSopenharmony_ci * ============================================================================= 1750cabdff1aSopenharmony_ci * Description : Transpose 8x8 block with halfword elements in vectors. 1751cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, ~ 1752cabdff1aSopenharmony_ci * Outputs - _out0, _out1, ~ 1753cabdff1aSopenharmony_ci * Details : The rows of the matrix become columns, and the columns become 1754cabdff1aSopenharmony_ci * rows. 1755cabdff1aSopenharmony_ci * Example : LASX_TRANSPOSE8x8_H 1756cabdff1aSopenharmony_ci * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1757cabdff1aSopenharmony_ci * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 1758cabdff1aSopenharmony_ci * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 1759cabdff1aSopenharmony_ci * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1760cabdff1aSopenharmony_ci * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 1761cabdff1aSopenharmony_ci * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1762cabdff1aSopenharmony_ci * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 1763cabdff1aSopenharmony_ci * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 1764cabdff1aSopenharmony_ci * 1765cabdff1aSopenharmony_ci * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 1766cabdff1aSopenharmony_ci * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 1767cabdff1aSopenharmony_ci * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 1768cabdff1aSopenharmony_ci * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 1769cabdff1aSopenharmony_ci * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 1770cabdff1aSopenharmony_ci * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 1771cabdff1aSopenharmony_ci * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 1772cabdff1aSopenharmony_ci * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 1773cabdff1aSopenharmony_ci * ============================================================================= 1774cabdff1aSopenharmony_ci */ 1775cabdff1aSopenharmony_ci#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1776cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1777cabdff1aSopenharmony_ci _out7) \ 1778cabdff1aSopenharmony_ci { \ 1779cabdff1aSopenharmony_ci __m256i _s0_m, _s1_m; \ 1780cabdff1aSopenharmony_ci __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ 1781cabdff1aSopenharmony_ci __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ 1782cabdff1aSopenharmony_ci \ 1783cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvl_h(_in6, _in4); \ 1784cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvl_h(_in7, _in5); \ 1785cabdff1aSopenharmony_ci _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1786cabdff1aSopenharmony_ci _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1787cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvh_h(_in6, _in4); \ 1788cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvh_h(_in7, _in5); \ 1789cabdff1aSopenharmony_ci _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1790cabdff1aSopenharmony_ci _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1791cabdff1aSopenharmony_ci \ 1792cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvl_h(_in2, _in0); \ 1793cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvl_h(_in3, _in1); \ 1794cabdff1aSopenharmony_ci _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1795cabdff1aSopenharmony_ci _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1796cabdff1aSopenharmony_ci _s0_m = __lasx_xvilvh_h(_in2, _in0); \ 1797cabdff1aSopenharmony_ci _s1_m = __lasx_xvilvh_h(_in3, _in1); \ 1798cabdff1aSopenharmony_ci _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ 1799cabdff1aSopenharmony_ci _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ 1800cabdff1aSopenharmony_ci \ 1801cabdff1aSopenharmony_ci _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ 1802cabdff1aSopenharmony_ci _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ 1803cabdff1aSopenharmony_ci _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ 1804cabdff1aSopenharmony_ci _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ 1805cabdff1aSopenharmony_ci _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ 1806cabdff1aSopenharmony_ci _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ 1807cabdff1aSopenharmony_ci _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ 1808cabdff1aSopenharmony_ci _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ 1809cabdff1aSopenharmony_ci } 1810cabdff1aSopenharmony_ci 1811cabdff1aSopenharmony_ci/* 1812cabdff1aSopenharmony_ci * ============================================================================= 1813cabdff1aSopenharmony_ci * Description : Butterfly of 4 input vectors 1814cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3 1815cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3 1816cabdff1aSopenharmony_ci * Details : Butterfly operation 1817cabdff1aSopenharmony_ci * Example : LASX_BUTTERFLY_4 1818cabdff1aSopenharmony_ci * _out0 = _in0 + _in3; 1819cabdff1aSopenharmony_ci * _out1 = _in1 + _in2; 1820cabdff1aSopenharmony_ci * _out2 = _in1 - _in2; 1821cabdff1aSopenharmony_ci * _out3 = _in0 - _in3; 1822cabdff1aSopenharmony_ci * ============================================================================= 1823cabdff1aSopenharmony_ci */ 1824cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1825cabdff1aSopenharmony_ci { \ 1826cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_b(_in0, _in3); \ 1827cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_b(_in1, _in2); \ 1828cabdff1aSopenharmony_ci _out2 = __lasx_xvsub_b(_in1, _in2); \ 1829cabdff1aSopenharmony_ci _out3 = __lasx_xvsub_b(_in0, _in3); \ 1830cabdff1aSopenharmony_ci } 1831cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1832cabdff1aSopenharmony_ci { \ 1833cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_h(_in0, _in3); \ 1834cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_h(_in1, _in2); \ 1835cabdff1aSopenharmony_ci _out2 = __lasx_xvsub_h(_in1, _in2); \ 1836cabdff1aSopenharmony_ci _out3 = __lasx_xvsub_h(_in0, _in3); \ 1837cabdff1aSopenharmony_ci } 1838cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1839cabdff1aSopenharmony_ci { \ 1840cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_w(_in0, _in3); \ 1841cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_w(_in1, _in2); \ 1842cabdff1aSopenharmony_ci _out2 = __lasx_xvsub_w(_in1, _in2); \ 1843cabdff1aSopenharmony_ci _out3 = __lasx_xvsub_w(_in0, _in3); \ 1844cabdff1aSopenharmony_ci } 1845cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ 1846cabdff1aSopenharmony_ci { \ 1847cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_d(_in0, _in3); \ 1848cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_d(_in1, _in2); \ 1849cabdff1aSopenharmony_ci _out2 = __lasx_xvsub_d(_in1, _in2); \ 1850cabdff1aSopenharmony_ci _out3 = __lasx_xvsub_d(_in0, _in3); \ 1851cabdff1aSopenharmony_ci } 1852cabdff1aSopenharmony_ci 1853cabdff1aSopenharmony_ci/* 1854cabdff1aSopenharmony_ci * ============================================================================= 1855cabdff1aSopenharmony_ci * Description : Butterfly of 8 input vectors 1856cabdff1aSopenharmony_ci * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ 1857cabdff1aSopenharmony_ci * Outputs - _out0, _out1, _out2, _out3, ~ 1858cabdff1aSopenharmony_ci * Details : Butterfly operation 1859cabdff1aSopenharmony_ci * Example : LASX_BUTTERFLY_8 1860cabdff1aSopenharmony_ci * _out0 = _in0 + _in7; 1861cabdff1aSopenharmony_ci * _out1 = _in1 + _in6; 1862cabdff1aSopenharmony_ci * _out2 = _in2 + _in5; 1863cabdff1aSopenharmony_ci * _out3 = _in3 + _in4; 1864cabdff1aSopenharmony_ci * _out4 = _in3 - _in4; 1865cabdff1aSopenharmony_ci * _out5 = _in2 - _in5; 1866cabdff1aSopenharmony_ci * _out6 = _in1 - _in6; 1867cabdff1aSopenharmony_ci * _out7 = _in0 - _in7; 1868cabdff1aSopenharmony_ci * ============================================================================= 1869cabdff1aSopenharmony_ci */ 1870cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1871cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1872cabdff1aSopenharmony_ci _out7) \ 1873cabdff1aSopenharmony_ci { \ 1874cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_b(_in0, _in7); \ 1875cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_b(_in1, _in6); \ 1876cabdff1aSopenharmony_ci _out2 = __lasx_xvadd_b(_in2, _in5); \ 1877cabdff1aSopenharmony_ci _out3 = __lasx_xvadd_b(_in3, _in4); \ 1878cabdff1aSopenharmony_ci _out4 = __lasx_xvsub_b(_in3, _in4); \ 1879cabdff1aSopenharmony_ci _out5 = __lasx_xvsub_b(_in2, _in5); \ 1880cabdff1aSopenharmony_ci _out6 = __lasx_xvsub_b(_in1, _in6); \ 1881cabdff1aSopenharmony_ci _out7 = __lasx_xvsub_b(_in0, _in7); \ 1882cabdff1aSopenharmony_ci } 1883cabdff1aSopenharmony_ci 1884cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1885cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1886cabdff1aSopenharmony_ci _out7) \ 1887cabdff1aSopenharmony_ci { \ 1888cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_h(_in0, _in7); \ 1889cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_h(_in1, _in6); \ 1890cabdff1aSopenharmony_ci _out2 = __lasx_xvadd_h(_in2, _in5); \ 1891cabdff1aSopenharmony_ci _out3 = __lasx_xvadd_h(_in3, _in4); \ 1892cabdff1aSopenharmony_ci _out4 = __lasx_xvsub_h(_in3, _in4); \ 1893cabdff1aSopenharmony_ci _out5 = __lasx_xvsub_h(_in2, _in5); \ 1894cabdff1aSopenharmony_ci _out6 = __lasx_xvsub_h(_in1, _in6); \ 1895cabdff1aSopenharmony_ci _out7 = __lasx_xvsub_h(_in0, _in7); \ 1896cabdff1aSopenharmony_ci } 1897cabdff1aSopenharmony_ci 1898cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1899cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1900cabdff1aSopenharmony_ci _out7) \ 1901cabdff1aSopenharmony_ci { \ 1902cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_w(_in0, _in7); \ 1903cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_w(_in1, _in6); \ 1904cabdff1aSopenharmony_ci _out2 = __lasx_xvadd_w(_in2, _in5); \ 1905cabdff1aSopenharmony_ci _out3 = __lasx_xvadd_w(_in3, _in4); \ 1906cabdff1aSopenharmony_ci _out4 = __lasx_xvsub_w(_in3, _in4); \ 1907cabdff1aSopenharmony_ci _out5 = __lasx_xvsub_w(_in2, _in5); \ 1908cabdff1aSopenharmony_ci _out6 = __lasx_xvsub_w(_in1, _in6); \ 1909cabdff1aSopenharmony_ci _out7 = __lasx_xvsub_w(_in0, _in7); \ 1910cabdff1aSopenharmony_ci } 1911cabdff1aSopenharmony_ci 1912cabdff1aSopenharmony_ci#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ 1913cabdff1aSopenharmony_ci _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ 1914cabdff1aSopenharmony_ci _out7) \ 1915cabdff1aSopenharmony_ci { \ 1916cabdff1aSopenharmony_ci _out0 = __lasx_xvadd_d(_in0, _in7); \ 1917cabdff1aSopenharmony_ci _out1 = __lasx_xvadd_d(_in1, _in6); \ 1918cabdff1aSopenharmony_ci _out2 = __lasx_xvadd_d(_in2, _in5); \ 1919cabdff1aSopenharmony_ci _out3 = __lasx_xvadd_d(_in3, _in4); \ 1920cabdff1aSopenharmony_ci _out4 = __lasx_xvsub_d(_in3, _in4); \ 1921cabdff1aSopenharmony_ci _out5 = __lasx_xvsub_d(_in2, _in5); \ 1922cabdff1aSopenharmony_ci _out6 = __lasx_xvsub_d(_in1, _in6); \ 1923cabdff1aSopenharmony_ci _out7 = __lasx_xvsub_d(_in0, _in7); \ 1924cabdff1aSopenharmony_ci } 1925cabdff1aSopenharmony_ci 1926cabdff1aSopenharmony_ci#endif // LASX 1927cabdff1aSopenharmony_ci 1928cabdff1aSopenharmony_ci/* 1929cabdff1aSopenharmony_ci * ============================================================================= 1930cabdff1aSopenharmony_ci * Description : Print out elements in vector. 1931cabdff1aSopenharmony_ci * Arguments : Inputs - RTYPE, _element_num, _in0, _enter 1932cabdff1aSopenharmony_ci * Outputs - 1933cabdff1aSopenharmony_ci * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if 1934cabdff1aSopenharmony_ci * '_enter' is TRUE, prefix "\nVP:" will be added first. 1935cabdff1aSopenharmony_ci * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 1936cabdff1aSopenharmony_ci * VP:1,2,3,4, 1937cabdff1aSopenharmony_ci * ============================================================================= 1938cabdff1aSopenharmony_ci */ 1939cabdff1aSopenharmony_ci#define VECT_PRINT(RTYPE, element_num, in0, enter) \ 1940cabdff1aSopenharmony_ci { \ 1941cabdff1aSopenharmony_ci RTYPE _tmp0 = (RTYPE)in0; \ 1942cabdff1aSopenharmony_ci int _i = 0; \ 1943cabdff1aSopenharmony_ci if (enter) printf("\nVP:"); \ 1944cabdff1aSopenharmony_ci for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ 1945cabdff1aSopenharmony_ci } 1946cabdff1aSopenharmony_ci 1947cabdff1aSopenharmony_ci#endif /* LOONGSON_INTRINSICS_H */ 1948cabdff1aSopenharmony_ci#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */ 1949