1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H 22cabdff1aSopenharmony_ci#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#include <stdint.h> 25cabdff1aSopenharmony_ci#include <msa.h> 26cabdff1aSopenharmony_ci#include <config.h> 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_ci#define ALIGNMENT 16 29cabdff1aSopenharmony_ci#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1))) 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_ci#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc)) 32cabdff1aSopenharmony_ci#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) 33cabdff1aSopenharmony_ci#define LD_SB(...) 
LD_V(v16i8, __VA_ARGS__) 34cabdff1aSopenharmony_ci#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) 35cabdff1aSopenharmony_ci#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) 36cabdff1aSopenharmony_ci#define LD_UW(...) LD_V(v4u32, __VA_ARGS__) 37cabdff1aSopenharmony_ci#define LD_SW(...) LD_V(v4i32, __VA_ARGS__) 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ci#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 40cabdff1aSopenharmony_ci#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) 41cabdff1aSopenharmony_ci#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) 42cabdff1aSopenharmony_ci#define ST_UH(...) ST_V(v8u16, __VA_ARGS__) 43cabdff1aSopenharmony_ci#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) 44cabdff1aSopenharmony_ci#define ST_UW(...) ST_V(v4u32, __VA_ARGS__) 45cabdff1aSopenharmony_ci#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_ci#if (__mips_isa_rev >= 6) 48cabdff1aSopenharmony_ci #define LH(psrc) \ 49cabdff1aSopenharmony_ci ( { \ 50cabdff1aSopenharmony_ci uint16_t val_lh_m = *(uint16_t *)(psrc); \ 51cabdff1aSopenharmony_ci val_lh_m; \ 52cabdff1aSopenharmony_ci } ) 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci #define LW(psrc) \ 55cabdff1aSopenharmony_ci ( { \ 56cabdff1aSopenharmony_ci uint32_t val_lw_m = *(uint32_t *)(psrc); \ 57cabdff1aSopenharmony_ci val_lw_m; \ 58cabdff1aSopenharmony_ci } ) 59cabdff1aSopenharmony_ci 60cabdff1aSopenharmony_ci #if (__mips == 64) 61cabdff1aSopenharmony_ci #define LD(psrc) \ 62cabdff1aSopenharmony_ci ( { \ 63cabdff1aSopenharmony_ci uint64_t val_ld_m = *(uint64_t *)(psrc); \ 64cabdff1aSopenharmony_ci val_ld_m; \ 65cabdff1aSopenharmony_ci } ) 66cabdff1aSopenharmony_ci #else // !(__mips == 64) 67cabdff1aSopenharmony_ci #define LD(psrc) \ 68cabdff1aSopenharmony_ci ( { \ 69cabdff1aSopenharmony_ci uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ 70cabdff1aSopenharmony_ci uint32_t val0_ld_m, val1_ld_m; \ 71cabdff1aSopenharmony_ci uint64_t val_ld_m = 0; \ 72cabdff1aSopenharmony_ci \ 73cabdff1aSopenharmony_ci 
val0_ld_m = LW(psrc_ld_m); \ 74cabdff1aSopenharmony_ci val1_ld_m = LW(psrc_ld_m + 4); \ 75cabdff1aSopenharmony_ci \ 76cabdff1aSopenharmony_ci val_ld_m = (uint64_t) (val1_ld_m); \ 77cabdff1aSopenharmony_ci val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 78cabdff1aSopenharmony_ci val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \ 79cabdff1aSopenharmony_ci \ 80cabdff1aSopenharmony_ci val_ld_m; \ 81cabdff1aSopenharmony_ci } ) 82cabdff1aSopenharmony_ci #endif // (__mips == 64) 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_ci #define SH(val, pdst) *(uint16_t *)(pdst) = (val); 85cabdff1aSopenharmony_ci #define SW(val, pdst) *(uint32_t *)(pdst) = (val); 86cabdff1aSopenharmony_ci #define SD(val, pdst) *(uint64_t *)(pdst) = (val); 87cabdff1aSopenharmony_ci 88cabdff1aSopenharmony_ci#else // !(__mips_isa_rev >= 6) 89cabdff1aSopenharmony_ci #define LH(psrc) \ 90cabdff1aSopenharmony_ci ( { \ 91cabdff1aSopenharmony_ci uint8_t *psrc_lh_m = (uint8_t *) (psrc); \ 92cabdff1aSopenharmony_ci uint16_t val_lh_m; \ 93cabdff1aSopenharmony_ci \ 94cabdff1aSopenharmony_ci __asm__ volatile ( \ 95cabdff1aSopenharmony_ci "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ 96cabdff1aSopenharmony_ci \ 97cabdff1aSopenharmony_ci : [val_lh_m] "=r" (val_lh_m) \ 98cabdff1aSopenharmony_ci : [psrc_lh_m] "m" (*psrc_lh_m) \ 99cabdff1aSopenharmony_ci ); \ 100cabdff1aSopenharmony_ci \ 101cabdff1aSopenharmony_ci val_lh_m; \ 102cabdff1aSopenharmony_ci } ) 103cabdff1aSopenharmony_ci 104cabdff1aSopenharmony_ci #define LW(psrc) \ 105cabdff1aSopenharmony_ci ( { \ 106cabdff1aSopenharmony_ci uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ 107cabdff1aSopenharmony_ci uint32_t val_lw_m; \ 108cabdff1aSopenharmony_ci \ 109cabdff1aSopenharmony_ci __asm__ volatile ( \ 110cabdff1aSopenharmony_ci "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ 111cabdff1aSopenharmony_ci "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ 112cabdff1aSopenharmony_ci \ 113cabdff1aSopenharmony_ci : [val_lw_m] "=&r"(val_lw_m) \ 
114cabdff1aSopenharmony_ci : [psrc_lw_m] "r"(psrc_lw_m) \ 115cabdff1aSopenharmony_ci ); \ 116cabdff1aSopenharmony_ci \ 117cabdff1aSopenharmony_ci val_lw_m; \ 118cabdff1aSopenharmony_ci } ) 119cabdff1aSopenharmony_ci 120cabdff1aSopenharmony_ci #if (__mips == 64) 121cabdff1aSopenharmony_ci #define LD(psrc) \ 122cabdff1aSopenharmony_ci ( { \ 123cabdff1aSopenharmony_ci uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ 124cabdff1aSopenharmony_ci uint64_t val_ld_m = 0; \ 125cabdff1aSopenharmony_ci \ 126cabdff1aSopenharmony_ci __asm__ volatile ( \ 127cabdff1aSopenharmony_ci "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ 128cabdff1aSopenharmony_ci "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ 129cabdff1aSopenharmony_ci \ 130cabdff1aSopenharmony_ci : [val_ld_m] "=&r" (val_ld_m) \ 131cabdff1aSopenharmony_ci : [psrc_ld_m] "r" (psrc_ld_m) \ 132cabdff1aSopenharmony_ci ); \ 133cabdff1aSopenharmony_ci \ 134cabdff1aSopenharmony_ci val_ld_m; \ 135cabdff1aSopenharmony_ci } ) 136cabdff1aSopenharmony_ci #else // !(__mips == 64) 137cabdff1aSopenharmony_ci #define LD(psrc) \ 138cabdff1aSopenharmony_ci ( { \ 139cabdff1aSopenharmony_ci uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ 140cabdff1aSopenharmony_ci uint32_t val0_ld_m, val1_ld_m; \ 141cabdff1aSopenharmony_ci uint64_t val_ld_m = 0; \ 142cabdff1aSopenharmony_ci \ 143cabdff1aSopenharmony_ci val0_ld_m = LW(psrc_ld_m); \ 144cabdff1aSopenharmony_ci val1_ld_m = LW(psrc_ld_m + 4); \ 145cabdff1aSopenharmony_ci \ 146cabdff1aSopenharmony_ci val_ld_m = (uint64_t) (val1_ld_m); \ 147cabdff1aSopenharmony_ci val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \ 148cabdff1aSopenharmony_ci val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \ 149cabdff1aSopenharmony_ci \ 150cabdff1aSopenharmony_ci val_ld_m; \ 151cabdff1aSopenharmony_ci } ) 152cabdff1aSopenharmony_ci #endif // (__mips == 64) 153cabdff1aSopenharmony_ci 154cabdff1aSopenharmony_ci #define SH(val, pdst) \ 155cabdff1aSopenharmony_ci { \ 156cabdff1aSopenharmony_ci uint8_t *pdst_sh_m = 
(uint8_t *) (pdst); \ 157cabdff1aSopenharmony_ci uint16_t val_sh_m = (val); \ 158cabdff1aSopenharmony_ci \ 159cabdff1aSopenharmony_ci __asm__ volatile ( \ 160cabdff1aSopenharmony_ci "ush %[val_sh_m], %[pdst_sh_m] \n\t" \ 161cabdff1aSopenharmony_ci \ 162cabdff1aSopenharmony_ci : [pdst_sh_m] "=m" (*pdst_sh_m) \ 163cabdff1aSopenharmony_ci : [val_sh_m] "r" (val_sh_m) \ 164cabdff1aSopenharmony_ci ); \ 165cabdff1aSopenharmony_ci } 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci #define SW(val, pdst) \ 168cabdff1aSopenharmony_ci { \ 169cabdff1aSopenharmony_ci uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ 170cabdff1aSopenharmony_ci uint32_t val_sw_m = (val); \ 171cabdff1aSopenharmony_ci \ 172cabdff1aSopenharmony_ci __asm__ volatile ( \ 173cabdff1aSopenharmony_ci "usw %[val_sw_m], %[pdst_sw_m] \n\t" \ 174cabdff1aSopenharmony_ci \ 175cabdff1aSopenharmony_ci : [pdst_sw_m] "=m" (*pdst_sw_m) \ 176cabdff1aSopenharmony_ci : [val_sw_m] "r" (val_sw_m) \ 177cabdff1aSopenharmony_ci ); \ 178cabdff1aSopenharmony_ci } 179cabdff1aSopenharmony_ci 180cabdff1aSopenharmony_ci #define SD(val, pdst) \ 181cabdff1aSopenharmony_ci { \ 182cabdff1aSopenharmony_ci uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ 183cabdff1aSopenharmony_ci uint32_t val0_sd_m, val1_sd_m; \ 184cabdff1aSopenharmony_ci \ 185cabdff1aSopenharmony_ci val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ 186cabdff1aSopenharmony_ci val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ 187cabdff1aSopenharmony_ci \ 188cabdff1aSopenharmony_ci SW(val0_sd_m, pdst_sd_m); \ 189cabdff1aSopenharmony_ci SW(val1_sd_m, pdst_sd_m + 4); \ 190cabdff1aSopenharmony_ci } 191cabdff1aSopenharmony_ci#endif // (__mips_isa_rev >= 6) 192cabdff1aSopenharmony_ci 193cabdff1aSopenharmony_ci/* Description : Load 4 words with stride 194cabdff1aSopenharmony_ci Arguments : Inputs - psrc (source pointer to load from) 195cabdff1aSopenharmony_ci - stride 196cabdff1aSopenharmony_ci Outputs - out0, out1, out2, out3 197cabdff1aSopenharmony_ci Details : 
Loads word in 'out0' from (psrc) 198cabdff1aSopenharmony_ci Loads word in 'out1' from (psrc + stride) 199cabdff1aSopenharmony_ci Loads word in 'out2' from (psrc + 2 * stride) 200cabdff1aSopenharmony_ci Loads word in 'out3' from (psrc + 3 * stride) 201cabdff1aSopenharmony_ci*/ 202cabdff1aSopenharmony_ci#define LW4(psrc, stride, out0, out1, out2, out3) \ 203cabdff1aSopenharmony_ci{ \ 204cabdff1aSopenharmony_ci out0 = LW((psrc)); \ 205cabdff1aSopenharmony_ci out1 = LW((psrc) + stride); \ 206cabdff1aSopenharmony_ci out2 = LW((psrc) + 2 * stride); \ 207cabdff1aSopenharmony_ci out3 = LW((psrc) + 3 * stride); \ 208cabdff1aSopenharmony_ci} 209cabdff1aSopenharmony_ci 210cabdff1aSopenharmony_ci#define LW2(psrc, stride, out0, out1) \ 211cabdff1aSopenharmony_ci{ \ 212cabdff1aSopenharmony_ci out0 = LW((psrc)); \ 213cabdff1aSopenharmony_ci out1 = LW((psrc) + stride); \ 214cabdff1aSopenharmony_ci} 215cabdff1aSopenharmony_ci 216cabdff1aSopenharmony_ci/* Description : Load double words with stride 217cabdff1aSopenharmony_ci Arguments : Inputs - psrc (source pointer to load from) 218cabdff1aSopenharmony_ci - stride 219cabdff1aSopenharmony_ci Outputs - out0, out1 220cabdff1aSopenharmony_ci Details : Loads double word in 'out0' from (psrc) 221cabdff1aSopenharmony_ci Loads double word in 'out1' from (psrc + stride) 222cabdff1aSopenharmony_ci*/ 223cabdff1aSopenharmony_ci#define LD2(psrc, stride, out0, out1) \ 224cabdff1aSopenharmony_ci{ \ 225cabdff1aSopenharmony_ci out0 = LD((psrc)); \ 226cabdff1aSopenharmony_ci out1 = LD((psrc) + stride); \ 227cabdff1aSopenharmony_ci} 228cabdff1aSopenharmony_ci#define LD4(psrc, stride, out0, out1, out2, out3) \ 229cabdff1aSopenharmony_ci{ \ 230cabdff1aSopenharmony_ci LD2((psrc), stride, out0, out1); \ 231cabdff1aSopenharmony_ci LD2((psrc) + 2 * stride, stride, out2, out3); \ 232cabdff1aSopenharmony_ci} 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_ci/* Description : Store 4 words with stride 235cabdff1aSopenharmony_ci Arguments : Inputs - in0, 
in1, in2, in3, pdst, stride 236cabdff1aSopenharmony_ci Details : Stores word from 'in0' to (pdst) 237cabdff1aSopenharmony_ci Stores word from 'in1' to (pdst + stride) 238cabdff1aSopenharmony_ci Stores word from 'in2' to (pdst + 2 * stride) 239cabdff1aSopenharmony_ci Stores word from 'in3' to (pdst + 3 * stride) 240cabdff1aSopenharmony_ci*/ 241cabdff1aSopenharmony_ci#define SW4(in0, in1, in2, in3, pdst, stride) \ 242cabdff1aSopenharmony_ci{ \ 243cabdff1aSopenharmony_ci SW(in0, (pdst)) \ 244cabdff1aSopenharmony_ci SW(in1, (pdst) + stride); \ 245cabdff1aSopenharmony_ci SW(in2, (pdst) + 2 * stride); \ 246cabdff1aSopenharmony_ci SW(in3, (pdst) + 3 * stride); \ 247cabdff1aSopenharmony_ci} 248cabdff1aSopenharmony_ci 249cabdff1aSopenharmony_ci/* Description : Store 4 double words with stride 250cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, pdst, stride 251cabdff1aSopenharmony_ci Details : Stores double word from 'in0' to (pdst) 252cabdff1aSopenharmony_ci Stores double word from 'in1' to (pdst + stride) 253cabdff1aSopenharmony_ci Stores double word from 'in2' to (pdst + 2 * stride) 254cabdff1aSopenharmony_ci Stores double word from 'in3' to (pdst + 3 * stride) 255cabdff1aSopenharmony_ci*/ 256cabdff1aSopenharmony_ci#define SD4(in0, in1, in2, in3, pdst, stride) \ 257cabdff1aSopenharmony_ci{ \ 258cabdff1aSopenharmony_ci SD(in0, (pdst)) \ 259cabdff1aSopenharmony_ci SD(in1, (pdst) + stride); \ 260cabdff1aSopenharmony_ci SD(in2, (pdst) + 2 * stride); \ 261cabdff1aSopenharmony_ci SD(in3, (pdst) + 3 * stride); \ 262cabdff1aSopenharmony_ci} 263cabdff1aSopenharmony_ci 264cabdff1aSopenharmony_ci/* Description : Load vector elements with stride 265cabdff1aSopenharmony_ci Arguments : Inputs - psrc (source pointer to load from) 266cabdff1aSopenharmony_ci - stride 267cabdff1aSopenharmony_ci Outputs - out0, out1 268cabdff1aSopenharmony_ci Return Type - as per RTYPE 269cabdff1aSopenharmony_ci Details : Loads elements in 'out0' from (psrc) 270cabdff1aSopenharmony_ci Loads 
elements in 'out1' from (psrc + stride) 271cabdff1aSopenharmony_ci*/ 272cabdff1aSopenharmony_ci#define LD_V2(RTYPE, psrc, stride, out0, out1) \ 273cabdff1aSopenharmony_ci{ \ 274cabdff1aSopenharmony_ci out0 = LD_V(RTYPE, (psrc)); \ 275cabdff1aSopenharmony_ci out1 = LD_V(RTYPE, (psrc) + stride); \ 276cabdff1aSopenharmony_ci} 277cabdff1aSopenharmony_ci#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) 278cabdff1aSopenharmony_ci#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) 279cabdff1aSopenharmony_ci#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__) 280cabdff1aSopenharmony_ci#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) 281cabdff1aSopenharmony_ci#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) 282cabdff1aSopenharmony_ci 283cabdff1aSopenharmony_ci#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ 284cabdff1aSopenharmony_ci{ \ 285cabdff1aSopenharmony_ci LD_V2(RTYPE, (psrc), stride, out0, out1); \ 286cabdff1aSopenharmony_ci out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ 287cabdff1aSopenharmony_ci} 288cabdff1aSopenharmony_ci#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) 289cabdff1aSopenharmony_ci#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__) 290cabdff1aSopenharmony_ci 291cabdff1aSopenharmony_ci#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 292cabdff1aSopenharmony_ci{ \ 293cabdff1aSopenharmony_ci LD_V2(RTYPE, (psrc), stride, out0, out1); \ 294cabdff1aSopenharmony_ci LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ 295cabdff1aSopenharmony_ci} 296cabdff1aSopenharmony_ci#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) 297cabdff1aSopenharmony_ci#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) 298cabdff1aSopenharmony_ci#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__) 299cabdff1aSopenharmony_ci#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) 300cabdff1aSopenharmony_ci#define LD_SW4(...) 
LD_V4(v4i32, __VA_ARGS__) 301cabdff1aSopenharmony_ci 302cabdff1aSopenharmony_ci#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 303cabdff1aSopenharmony_ci{ \ 304cabdff1aSopenharmony_ci LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 305cabdff1aSopenharmony_ci out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ 306cabdff1aSopenharmony_ci} 307cabdff1aSopenharmony_ci#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) 308cabdff1aSopenharmony_ci#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) 309cabdff1aSopenharmony_ci 310cabdff1aSopenharmony_ci#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ 311cabdff1aSopenharmony_ci{ \ 312cabdff1aSopenharmony_ci LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 313cabdff1aSopenharmony_ci LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \ 314cabdff1aSopenharmony_ci} 315cabdff1aSopenharmony_ci#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__) 316cabdff1aSopenharmony_ci#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__) 317cabdff1aSopenharmony_ci#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__) 318cabdff1aSopenharmony_ci#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__) 319cabdff1aSopenharmony_ci 320cabdff1aSopenharmony_ci#define LD_V7(RTYPE, psrc, stride, \ 321cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6) \ 322cabdff1aSopenharmony_ci{ \ 323cabdff1aSopenharmony_ci LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 324cabdff1aSopenharmony_ci LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 325cabdff1aSopenharmony_ci} 326cabdff1aSopenharmony_ci#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__) 327cabdff1aSopenharmony_ci#define LD_SB7(...) 
LD_V7(v16i8, __VA_ARGS__) 328cabdff1aSopenharmony_ci 329cabdff1aSopenharmony_ci#define LD_V8(RTYPE, psrc, stride, \ 330cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 331cabdff1aSopenharmony_ci{ \ 332cabdff1aSopenharmony_ci LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 333cabdff1aSopenharmony_ci LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 334cabdff1aSopenharmony_ci} 335cabdff1aSopenharmony_ci#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) 336cabdff1aSopenharmony_ci#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) 337cabdff1aSopenharmony_ci#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__) 338cabdff1aSopenharmony_ci#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) 339cabdff1aSopenharmony_ci#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__) 340cabdff1aSopenharmony_ci 341cabdff1aSopenharmony_ci#define LD_V16(RTYPE, psrc, stride, \ 342cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7, \ 343cabdff1aSopenharmony_ci out8, out9, out10, out11, out12, out13, out14, out15) \ 344cabdff1aSopenharmony_ci{ \ 345cabdff1aSopenharmony_ci LD_V8(RTYPE, (psrc), stride, \ 346cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7); \ 347cabdff1aSopenharmony_ci LD_V8(RTYPE, (psrc) + 8 * stride, stride, \ 348cabdff1aSopenharmony_ci out8, out9, out10, out11, out12, out13, out14, out15); \ 349cabdff1aSopenharmony_ci} 350cabdff1aSopenharmony_ci#define LD_SH16(...) 
LD_V16(v8i16, __VA_ARGS__) 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_ci/* Description : Store vectors with stride 353cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, stride 354cabdff1aSopenharmony_ci Outputs - pdst (destination pointer to store to) 355cabdff1aSopenharmony_ci Details : Stores elements from 'in0' to (pdst) 356cabdff1aSopenharmony_ci Stores elements from 'in1' to (pdst + stride) 357cabdff1aSopenharmony_ci*/ 358cabdff1aSopenharmony_ci#define ST_V2(RTYPE, in0, in1, pdst, stride) \ 359cabdff1aSopenharmony_ci{ \ 360cabdff1aSopenharmony_ci ST_V(RTYPE, in0, (pdst)); \ 361cabdff1aSopenharmony_ci ST_V(RTYPE, in1, (pdst) + stride); \ 362cabdff1aSopenharmony_ci} 363cabdff1aSopenharmony_ci#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) 364cabdff1aSopenharmony_ci#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__) 365cabdff1aSopenharmony_ci#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__) 366cabdff1aSopenharmony_ci#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) 367cabdff1aSopenharmony_ci#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) 368cabdff1aSopenharmony_ci 369cabdff1aSopenharmony_ci#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 370cabdff1aSopenharmony_ci{ \ 371cabdff1aSopenharmony_ci ST_V2(RTYPE, in0, in1, (pdst), stride); \ 372cabdff1aSopenharmony_ci ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 373cabdff1aSopenharmony_ci} 374cabdff1aSopenharmony_ci#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) 375cabdff1aSopenharmony_ci#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__) 376cabdff1aSopenharmony_ci#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) 377cabdff1aSopenharmony_ci#define ST_SW4(...) 
ST_V4(v4i32, __VA_ARGS__) 378cabdff1aSopenharmony_ci 379cabdff1aSopenharmony_ci#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ 380cabdff1aSopenharmony_ci{ \ 381cabdff1aSopenharmony_ci ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 382cabdff1aSopenharmony_ci ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \ 383cabdff1aSopenharmony_ci} 384cabdff1aSopenharmony_ci#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__) 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_ci#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 387cabdff1aSopenharmony_ci{ \ 388cabdff1aSopenharmony_ci ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 389cabdff1aSopenharmony_ci ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 390cabdff1aSopenharmony_ci} 391cabdff1aSopenharmony_ci#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) 392cabdff1aSopenharmony_ci#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) 393cabdff1aSopenharmony_ci#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__) 394cabdff1aSopenharmony_ci 395cabdff1aSopenharmony_ci/* Description : Store half word elements of vector with stride 396cabdff1aSopenharmony_ci * Arguments : Inputs - in source vector 397cabdff1aSopenharmony_ci * - pdst (destination pointer to store to) 398cabdff1aSopenharmony_ci * - stride 399cabdff1aSopenharmony_ci * Details : Stores half word 'idx0' from 'in' to (pdst) 400cabdff1aSopenharmony_ci * Stores half word 'idx1' from 'in' to (pdst + stride) 401cabdff1aSopenharmony_ci * Similar for other elements 402cabdff1aSopenharmony_ci */ 403cabdff1aSopenharmony_ci#define ST_H1(in, idx, pdst) \ 404cabdff1aSopenharmony_ci{ \ 405cabdff1aSopenharmony_ci uint16_t out0_m; \ 406cabdff1aSopenharmony_ci out0_m = __msa_copy_u_h((v8i16) in, idx); \ 407cabdff1aSopenharmony_ci SH(out0_m, (pdst)); \ 408cabdff1aSopenharmony_ci} 409cabdff1aSopenharmony_ci#define ST_H2(in, idx0, idx1, pdst, stride) \ 410cabdff1aSopenharmony_ci{ \ 411cabdff1aSopenharmony_ci uint16_t out0_m, out1_m; \ 
412cabdff1aSopenharmony_ci out0_m = __msa_copy_u_h((v8i16) in, idx0); \ 413cabdff1aSopenharmony_ci out1_m = __msa_copy_u_h((v8i16) in, idx1); \ 414cabdff1aSopenharmony_ci SH(out0_m, (pdst)); \ 415cabdff1aSopenharmony_ci SH(out1_m, (pdst) + stride); \ 416cabdff1aSopenharmony_ci} 417cabdff1aSopenharmony_ci#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \ 418cabdff1aSopenharmony_ci{ \ 419cabdff1aSopenharmony_ci uint16_t out0_m, out1_m, out2_m, out3_m; \ 420cabdff1aSopenharmony_ci out0_m = __msa_copy_u_h((v8i16) in, idx0); \ 421cabdff1aSopenharmony_ci out1_m = __msa_copy_u_h((v8i16) in, idx1); \ 422cabdff1aSopenharmony_ci out2_m = __msa_copy_u_h((v8i16) in, idx2); \ 423cabdff1aSopenharmony_ci out3_m = __msa_copy_u_h((v8i16) in, idx3); \ 424cabdff1aSopenharmony_ci SH(out0_m, (pdst)); \ 425cabdff1aSopenharmony_ci SH(out1_m, (pdst) + stride); \ 426cabdff1aSopenharmony_ci SH(out2_m, (pdst) + 2 * stride); \ 427cabdff1aSopenharmony_ci SH(out3_m, (pdst) + 3 * stride); \ 428cabdff1aSopenharmony_ci} 429cabdff1aSopenharmony_ci#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \ 430cabdff1aSopenharmony_ci idx6, idx7, pdst, stride) \ 431cabdff1aSopenharmony_ci{ \ 432cabdff1aSopenharmony_ci ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \ 433cabdff1aSopenharmony_ci ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \ 434cabdff1aSopenharmony_ci} 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_ci/* Description : Store word elements of vector with stride 437cabdff1aSopenharmony_ci * Arguments : Inputs - in source vector 438cabdff1aSopenharmony_ci * - pdst (destination pointer to store to) 439cabdff1aSopenharmony_ci * - stride 440cabdff1aSopenharmony_ci * Details : Stores word 'idx0' from 'in' to (pdst) 441cabdff1aSopenharmony_ci * Stores word 'idx1' from 'in' to (pdst + stride) 442cabdff1aSopenharmony_ci * Similar for other elements 443cabdff1aSopenharmony_ci */ 444cabdff1aSopenharmony_ci#define ST_W1(in, idx, pdst) \ 445cabdff1aSopenharmony_ci{ \ 
446cabdff1aSopenharmony_ci uint32_t out0_m; \ 447cabdff1aSopenharmony_ci out0_m = __msa_copy_u_w((v4i32) in, idx); \ 448cabdff1aSopenharmony_ci SW(out0_m, (pdst)); \ 449cabdff1aSopenharmony_ci} 450cabdff1aSopenharmony_ci#define ST_W2(in, idx0, idx1, pdst, stride) \ 451cabdff1aSopenharmony_ci{ \ 452cabdff1aSopenharmony_ci uint32_t out0_m, out1_m; \ 453cabdff1aSopenharmony_ci out0_m = __msa_copy_u_w((v4i32) in, idx0); \ 454cabdff1aSopenharmony_ci out1_m = __msa_copy_u_w((v4i32) in, idx1); \ 455cabdff1aSopenharmony_ci SW(out0_m, (pdst)); \ 456cabdff1aSopenharmony_ci SW(out1_m, (pdst) + stride); \ 457cabdff1aSopenharmony_ci} 458cabdff1aSopenharmony_ci#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride) \ 459cabdff1aSopenharmony_ci{ \ 460cabdff1aSopenharmony_ci uint32_t out0_m, out1_m, out2_m, out3_m; \ 461cabdff1aSopenharmony_ci out0_m = __msa_copy_u_w((v4i32) in, idx0); \ 462cabdff1aSopenharmony_ci out1_m = __msa_copy_u_w((v4i32) in, idx1); \ 463cabdff1aSopenharmony_ci out2_m = __msa_copy_u_w((v4i32) in, idx2); \ 464cabdff1aSopenharmony_ci out3_m = __msa_copy_u_w((v4i32) in, idx3); \ 465cabdff1aSopenharmony_ci SW(out0_m, (pdst)); \ 466cabdff1aSopenharmony_ci SW(out1_m, (pdst) + stride); \ 467cabdff1aSopenharmony_ci SW(out2_m, (pdst) + 2*stride); \ 468cabdff1aSopenharmony_ci SW(out3_m, (pdst) + 3*stride); \ 469cabdff1aSopenharmony_ci} 470cabdff1aSopenharmony_ci#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, \ 471cabdff1aSopenharmony_ci idx4, idx5, idx6, idx7, pdst, stride) \ 472cabdff1aSopenharmony_ci{ \ 473cabdff1aSopenharmony_ci ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride) \ 474cabdff1aSopenharmony_ci ST_W4(in1, idx4, idx5, idx6, idx7, pdst + 4*stride, stride) \ 475cabdff1aSopenharmony_ci} 476cabdff1aSopenharmony_ci 477cabdff1aSopenharmony_ci/* Description : Store double word elements of vector with stride 478cabdff1aSopenharmony_ci * Arguments : Inputs - in source vector 479cabdff1aSopenharmony_ci * - pdst (destination pointer to store to) 
480cabdff1aSopenharmony_ci * - stride 481cabdff1aSopenharmony_ci * Details : Stores double word 'idx0' from 'in' to (pdst) 482cabdff1aSopenharmony_ci * Stores double word 'idx1' from 'in' to (pdst + stride) 483cabdff1aSopenharmony_ci * Similar for other elements 484cabdff1aSopenharmony_ci */ 485cabdff1aSopenharmony_ci#define ST_D1(in, idx, pdst) \ 486cabdff1aSopenharmony_ci{ \ 487cabdff1aSopenharmony_ci uint64_t out0_m; \ 488cabdff1aSopenharmony_ci out0_m = __msa_copy_u_d((v2i64) in, idx); \ 489cabdff1aSopenharmony_ci SD(out0_m, (pdst)); \ 490cabdff1aSopenharmony_ci} 491cabdff1aSopenharmony_ci#define ST_D2(in, idx0, idx1, pdst, stride) \ 492cabdff1aSopenharmony_ci{ \ 493cabdff1aSopenharmony_ci uint64_t out0_m, out1_m; \ 494cabdff1aSopenharmony_ci out0_m = __msa_copy_u_d((v2i64) in, idx0); \ 495cabdff1aSopenharmony_ci out1_m = __msa_copy_u_d((v2i64) in, idx1); \ 496cabdff1aSopenharmony_ci SD(out0_m, (pdst)); \ 497cabdff1aSopenharmony_ci SD(out1_m, (pdst) + stride); \ 498cabdff1aSopenharmony_ci} 499cabdff1aSopenharmony_ci#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 500cabdff1aSopenharmony_ci{ \ 501cabdff1aSopenharmony_ci uint64_t out0_m, out1_m, out2_m, out3_m; \ 502cabdff1aSopenharmony_ci out0_m = __msa_copy_u_d((v2i64) in0, idx0); \ 503cabdff1aSopenharmony_ci out1_m = __msa_copy_u_d((v2i64) in0, idx1); \ 504cabdff1aSopenharmony_ci out2_m = __msa_copy_u_d((v2i64) in1, idx2); \ 505cabdff1aSopenharmony_ci out3_m = __msa_copy_u_d((v2i64) in1, idx3); \ 506cabdff1aSopenharmony_ci SD(out0_m, (pdst)); \ 507cabdff1aSopenharmony_ci SD(out1_m, (pdst) + stride); \ 508cabdff1aSopenharmony_ci SD(out2_m, (pdst) + 2 * stride); \ 509cabdff1aSopenharmony_ci SD(out3_m, (pdst) + 3 * stride); \ 510cabdff1aSopenharmony_ci} 511cabdff1aSopenharmony_ci#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, \ 512cabdff1aSopenharmony_ci idx4, idx5, idx6, idx7, pdst, stride) \ 513cabdff1aSopenharmony_ci{ \ 514cabdff1aSopenharmony_ci ST_D4(in0, in1, idx0, idx1, idx2, idx3, 
pdst, stride) \ 515cabdff1aSopenharmony_ci ST_D4(in2, in3, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \ 516cabdff1aSopenharmony_ci} 517cabdff1aSopenharmony_ci 518cabdff1aSopenharmony_ci/* Description : Store as 12x8 byte block to destination memory from 519cabdff1aSopenharmony_ci input vectors 520cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 521cabdff1aSopenharmony_ci Details : Index 0 double word element from input vector 'in0' is copied 522cabdff1aSopenharmony_ci and stored to destination memory at (pblk_12x8_m) followed by 523cabdff1aSopenharmony_ci index 2 word element from same input vector 'in0' at 524cabdff1aSopenharmony_ci (pblk_12x8_m + 8) 525cabdff1aSopenharmony_ci Similar to remaining lines 526cabdff1aSopenharmony_ci*/ 527cabdff1aSopenharmony_ci#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 528cabdff1aSopenharmony_ci{ \ 529cabdff1aSopenharmony_ci uint64_t out0_m, out1_m, out2_m, out3_m; \ 530cabdff1aSopenharmony_ci uint64_t out4_m, out5_m, out6_m, out7_m; \ 531cabdff1aSopenharmony_ci uint32_t out8_m, out9_m, out10_m, out11_m; \ 532cabdff1aSopenharmony_ci uint32_t out12_m, out13_m, out14_m, out15_m; \ 533cabdff1aSopenharmony_ci uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \ 534cabdff1aSopenharmony_ci \ 535cabdff1aSopenharmony_ci out0_m = __msa_copy_u_d((v2i64) in0, 0); \ 536cabdff1aSopenharmony_ci out1_m = __msa_copy_u_d((v2i64) in1, 0); \ 537cabdff1aSopenharmony_ci out2_m = __msa_copy_u_d((v2i64) in2, 0); \ 538cabdff1aSopenharmony_ci out3_m = __msa_copy_u_d((v2i64) in3, 0); \ 539cabdff1aSopenharmony_ci out4_m = __msa_copy_u_d((v2i64) in4, 0); \ 540cabdff1aSopenharmony_ci out5_m = __msa_copy_u_d((v2i64) in5, 0); \ 541cabdff1aSopenharmony_ci out6_m = __msa_copy_u_d((v2i64) in6, 0); \ 542cabdff1aSopenharmony_ci out7_m = __msa_copy_u_d((v2i64) in7, 0); \ 543cabdff1aSopenharmony_ci \ 544cabdff1aSopenharmony_ci out8_m = __msa_copy_u_w((v4i32) in0, 2); \ 
545cabdff1aSopenharmony_ci out9_m = __msa_copy_u_w((v4i32) in1, 2); \ 546cabdff1aSopenharmony_ci out10_m = __msa_copy_u_w((v4i32) in2, 2); \ 547cabdff1aSopenharmony_ci out11_m = __msa_copy_u_w((v4i32) in3, 2); \ 548cabdff1aSopenharmony_ci out12_m = __msa_copy_u_w((v4i32) in4, 2); \ 549cabdff1aSopenharmony_ci out13_m = __msa_copy_u_w((v4i32) in5, 2); \ 550cabdff1aSopenharmony_ci out14_m = __msa_copy_u_w((v4i32) in6, 2); \ 551cabdff1aSopenharmony_ci out15_m = __msa_copy_u_w((v4i32) in7, 2); \ 552cabdff1aSopenharmony_ci \ 553cabdff1aSopenharmony_ci SD(out0_m, pblk_12x8_m); \ 554cabdff1aSopenharmony_ci SW(out8_m, pblk_12x8_m + 8); \ 555cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 556cabdff1aSopenharmony_ci SD(out1_m, pblk_12x8_m); \ 557cabdff1aSopenharmony_ci SW(out9_m, pblk_12x8_m + 8); \ 558cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 559cabdff1aSopenharmony_ci SD(out2_m, pblk_12x8_m); \ 560cabdff1aSopenharmony_ci SW(out10_m, pblk_12x8_m + 8); \ 561cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 562cabdff1aSopenharmony_ci SD(out3_m, pblk_12x8_m); \ 563cabdff1aSopenharmony_ci SW(out11_m, pblk_12x8_m + 8); \ 564cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 565cabdff1aSopenharmony_ci SD(out4_m, pblk_12x8_m); \ 566cabdff1aSopenharmony_ci SW(out12_m, pblk_12x8_m + 8); \ 567cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 568cabdff1aSopenharmony_ci SD(out5_m, pblk_12x8_m); \ 569cabdff1aSopenharmony_ci SW(out13_m, pblk_12x8_m + 8); \ 570cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 571cabdff1aSopenharmony_ci SD(out6_m, pblk_12x8_m); \ 572cabdff1aSopenharmony_ci SW(out14_m, pblk_12x8_m + 8); \ 573cabdff1aSopenharmony_ci pblk_12x8_m += stride; \ 574cabdff1aSopenharmony_ci SD(out7_m, pblk_12x8_m); \ 575cabdff1aSopenharmony_ci SW(out15_m, pblk_12x8_m + 8); \ 576cabdff1aSopenharmony_ci} 577cabdff1aSopenharmony_ci 578cabdff1aSopenharmony_ci/* Description : average with rounding (in0 + in1 + 1) / 2. 
579cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, 580cabdff1aSopenharmony_ci Outputs - out0, out1 581cabdff1aSopenharmony_ci Return Type - as per RTYPE 582cabdff1aSopenharmony_ci Details : Each byte element from 'in0' vector is added with each byte 583cabdff1aSopenharmony_ci element from 'in1' vector. The addition of the elements plus 1 584cabdff1aSopenharmony_ci (for rounding) is done unsigned with full precision, 585cabdff1aSopenharmony_ci i.e. the result has one extra bit. Unsigned division by 2 586cabdff1aSopenharmony_ci (or logical shift right by one bit) is performed before writing 587cabdff1aSopenharmony_ci the result to vector 'out0' 588cabdff1aSopenharmony_ci Similar for the pair of 'in2' and 'in3' 589cabdff1aSopenharmony_ci*/ 590cabdff1aSopenharmony_ci#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 591cabdff1aSopenharmony_ci{ \ 592cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \ 593cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \ 594cabdff1aSopenharmony_ci} 595cabdff1aSopenharmony_ci#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) 596cabdff1aSopenharmony_ci 597cabdff1aSopenharmony_ci#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 598cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 599cabdff1aSopenharmony_ci{ \ 600cabdff1aSopenharmony_ci AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 601cabdff1aSopenharmony_ci AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 602cabdff1aSopenharmony_ci} 603cabdff1aSopenharmony_ci#define AVER_UB4_UB(...) 
AVER_UB4(v16u8, __VA_ARGS__) 604cabdff1aSopenharmony_ci 605cabdff1aSopenharmony_ci/* Description : Immediate number of columns to slide 606cabdff1aSopenharmony_ci Arguments : Inputs - s, d, slide_val 607cabdff1aSopenharmony_ci Outputs - out 608cabdff1aSopenharmony_ci Return Type - as per RTYPE 609cabdff1aSopenharmony_ci Details : Byte elements from 'd' vector are slide into 's' by 610cabdff1aSopenharmony_ci number of elements specified by 'slide_val' 611cabdff1aSopenharmony_ci*/ 612cabdff1aSopenharmony_ci#define SLDI_B(RTYPE, d, s, slide_val, out) \ 613cabdff1aSopenharmony_ci{ \ 614cabdff1aSopenharmony_ci out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val); \ 615cabdff1aSopenharmony_ci} 616cabdff1aSopenharmony_ci 617cabdff1aSopenharmony_ci#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \ 618cabdff1aSopenharmony_ci{ \ 619cabdff1aSopenharmony_ci SLDI_B(RTYPE, d0, s0, slide_val, out0) \ 620cabdff1aSopenharmony_ci SLDI_B(RTYPE, d1, s1, slide_val, out1) \ 621cabdff1aSopenharmony_ci} 622cabdff1aSopenharmony_ci#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 623cabdff1aSopenharmony_ci#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__) 624cabdff1aSopenharmony_ci#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) 625cabdff1aSopenharmony_ci#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__) 626cabdff1aSopenharmony_ci 627cabdff1aSopenharmony_ci#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val, \ 628cabdff1aSopenharmony_ci out0, out1, out2) \ 629cabdff1aSopenharmony_ci{ \ 630cabdff1aSopenharmony_ci SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \ 631cabdff1aSopenharmony_ci SLDI_B(RTYPE, d2, s2, slide_val, out2) \ 632cabdff1aSopenharmony_ci} 633cabdff1aSopenharmony_ci#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__) 634cabdff1aSopenharmony_ci#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 635cabdff1aSopenharmony_ci#define SLDI_B3_UH(...) 
SLDI_B3(v8u16, __VA_ARGS__) 636cabdff1aSopenharmony_ci 637cabdff1aSopenharmony_ci#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3, \ 638cabdff1aSopenharmony_ci slide_val, out0, out1, out2, out3) \ 639cabdff1aSopenharmony_ci{ \ 640cabdff1aSopenharmony_ci SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \ 641cabdff1aSopenharmony_ci SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3) \ 642cabdff1aSopenharmony_ci} 643cabdff1aSopenharmony_ci#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__) 644cabdff1aSopenharmony_ci#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__) 645cabdff1aSopenharmony_ci#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__) 646cabdff1aSopenharmony_ci 647cabdff1aSopenharmony_ci/* Description : Shuffle byte vector elements as per mask vector 648cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 649cabdff1aSopenharmony_ci Outputs - out0, out1 650cabdff1aSopenharmony_ci Return Type - as per RTYPE 651cabdff1aSopenharmony_ci Details : Selective byte elements from in0 & in1 are copied to out0 as 652cabdff1aSopenharmony_ci per control vector mask0 653cabdff1aSopenharmony_ci Selective byte elements from in2 & in3 are copied to out1 as 654cabdff1aSopenharmony_ci per control vector mask1 655cabdff1aSopenharmony_ci*/ 656cabdff1aSopenharmony_ci#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 657cabdff1aSopenharmony_ci{ \ 658cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ 659cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ 660cabdff1aSopenharmony_ci} 661cabdff1aSopenharmony_ci#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 662cabdff1aSopenharmony_ci#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) 663cabdff1aSopenharmony_ci#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 664cabdff1aSopenharmony_ci#define VSHF_B2_SH(...) 
VSHF_B2(v8i16, __VA_ARGS__) 665cabdff1aSopenharmony_ci 666cabdff1aSopenharmony_ci#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \ 667cabdff1aSopenharmony_ci out0, out1, out2) \ 668cabdff1aSopenharmony_ci{ \ 669cabdff1aSopenharmony_ci VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ 670cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \ 671cabdff1aSopenharmony_ci} 672cabdff1aSopenharmony_ci#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__) 673cabdff1aSopenharmony_ci 674cabdff1aSopenharmony_ci#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ 675cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 676cabdff1aSopenharmony_ci{ \ 677cabdff1aSopenharmony_ci VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 678cabdff1aSopenharmony_ci VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 679cabdff1aSopenharmony_ci} 680cabdff1aSopenharmony_ci#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 681cabdff1aSopenharmony_ci#define VSHF_B4_SH(...) 
VSHF_B4(v8i16, __VA_ARGS__) 682cabdff1aSopenharmony_ci 683cabdff1aSopenharmony_ci/* Description : Shuffle halfword vector elements as per mask vector 684cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 685cabdff1aSopenharmony_ci Outputs - out0, out1 686cabdff1aSopenharmony_ci Return Type - as per RTYPE 687cabdff1aSopenharmony_ci Details : Selective halfword elements from in0 & in1 are copied to out0 688cabdff1aSopenharmony_ci as per control vector mask0 689cabdff1aSopenharmony_ci Selective halfword elements from in2 & in3 are copied to out1 690cabdff1aSopenharmony_ci as per control vector mask1 691cabdff1aSopenharmony_ci*/ 692cabdff1aSopenharmony_ci#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 693cabdff1aSopenharmony_ci{ \ 694cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \ 695cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \ 696cabdff1aSopenharmony_ci} 697cabdff1aSopenharmony_ci#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) 698cabdff1aSopenharmony_ci 699cabdff1aSopenharmony_ci#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \ 700cabdff1aSopenharmony_ci out0, out1, out2) \ 701cabdff1aSopenharmony_ci{ \ 702cabdff1aSopenharmony_ci VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ 703cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4); \ 704cabdff1aSopenharmony_ci} 705cabdff1aSopenharmony_ci#define VSHF_H3_SH(...) 
VSHF_H3(v8i16, __VA_ARGS__) 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_ci/* Description : Shuffle byte vector elements as per mask vector 708cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 709cabdff1aSopenharmony_ci Outputs - out0, out1 710cabdff1aSopenharmony_ci Return Type - as per RTYPE 711cabdff1aSopenharmony_ci Details : Selective byte elements from in0 & in1 are copied to out0 as 712cabdff1aSopenharmony_ci per control vector mask0 713cabdff1aSopenharmony_ci Selective byte elements from in2 & in3 are copied to out1 as 714cabdff1aSopenharmony_ci per control vector mask1 715cabdff1aSopenharmony_ci*/ 716cabdff1aSopenharmony_ci#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 717cabdff1aSopenharmony_ci{ \ 718cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \ 719cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \ 720cabdff1aSopenharmony_ci} 721cabdff1aSopenharmony_ci#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__) 722cabdff1aSopenharmony_ci 723cabdff1aSopenharmony_ci/* Description : Dot product of byte vector elements 724cabdff1aSopenharmony_ci Arguments : Inputs - mult0, mult1 725cabdff1aSopenharmony_ci cnst0, cnst1 726cabdff1aSopenharmony_ci Outputs - out0, out1 727cabdff1aSopenharmony_ci Return Type - as per RTYPE 728cabdff1aSopenharmony_ci Details : Unsigned byte elements from mult0 are multiplied with 729cabdff1aSopenharmony_ci unsigned byte elements from cnst0 producing a result 730cabdff1aSopenharmony_ci twice the size of input i.e. unsigned halfword. 
731cabdff1aSopenharmony_ci Then this multiplication results of adjacent odd-even elements 732cabdff1aSopenharmony_ci are added together and stored to the out vector 733cabdff1aSopenharmony_ci (2 unsigned halfword results) 734cabdff1aSopenharmony_ci*/ 735cabdff1aSopenharmony_ci#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 736cabdff1aSopenharmony_ci{ \ 737cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \ 738cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \ 739cabdff1aSopenharmony_ci} 740cabdff1aSopenharmony_ci#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 741cabdff1aSopenharmony_ci 742cabdff1aSopenharmony_ci#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ 743cabdff1aSopenharmony_ci cnst0, cnst1, cnst2, cnst3, \ 744cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 745cabdff1aSopenharmony_ci{ \ 746cabdff1aSopenharmony_ci DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 747cabdff1aSopenharmony_ci DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 748cabdff1aSopenharmony_ci} 749cabdff1aSopenharmony_ci#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 750cabdff1aSopenharmony_ci 751cabdff1aSopenharmony_ci/* Description : Dot product of byte vector elements 752cabdff1aSopenharmony_ci Arguments : Inputs - mult0, mult1 753cabdff1aSopenharmony_ci cnst0, cnst1 754cabdff1aSopenharmony_ci Outputs - out0, out1 755cabdff1aSopenharmony_ci Return Type - as per RTYPE 756cabdff1aSopenharmony_ci Details : Signed byte elements from mult0 are multiplied with 757cabdff1aSopenharmony_ci signed byte elements from cnst0 producing a result 758cabdff1aSopenharmony_ci twice the size of input i.e. signed halfword. 
759cabdff1aSopenharmony_ci Then this multiplication results of adjacent odd-even elements 760cabdff1aSopenharmony_ci are added together and stored to the out vector 761cabdff1aSopenharmony_ci (2 signed halfword results) 762cabdff1aSopenharmony_ci*/ 763cabdff1aSopenharmony_ci#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 764cabdff1aSopenharmony_ci{ \ 765cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \ 766cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \ 767cabdff1aSopenharmony_ci} 768cabdff1aSopenharmony_ci#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 769cabdff1aSopenharmony_ci 770cabdff1aSopenharmony_ci#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \ 771cabdff1aSopenharmony_ci out0, out1, out2) \ 772cabdff1aSopenharmony_ci{ \ 773cabdff1aSopenharmony_ci DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 774cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \ 775cabdff1aSopenharmony_ci} 776cabdff1aSopenharmony_ci#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__) 777cabdff1aSopenharmony_ci 778cabdff1aSopenharmony_ci#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 779cabdff1aSopenharmony_ci cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 780cabdff1aSopenharmony_ci{ \ 781cabdff1aSopenharmony_ci DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 782cabdff1aSopenharmony_ci DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 783cabdff1aSopenharmony_ci} 784cabdff1aSopenharmony_ci#define DOTP_SB4_SH(...) 
DOTP_SB4(v8i16, __VA_ARGS__) 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci/* Description : Dot product of halfword vector elements 787cabdff1aSopenharmony_ci Arguments : Inputs - mult0, mult1 788cabdff1aSopenharmony_ci cnst0, cnst1 789cabdff1aSopenharmony_ci Outputs - out0, out1 790cabdff1aSopenharmony_ci Return Type - as per RTYPE 791cabdff1aSopenharmony_ci Details : Signed halfword elements from mult0 are multiplied with 792cabdff1aSopenharmony_ci signed halfword elements from cnst0 producing a result 793cabdff1aSopenharmony_ci twice the size of input i.e. signed word. 794cabdff1aSopenharmony_ci Then this multiplication results of adjacent odd-even elements 795cabdff1aSopenharmony_ci are added together and stored to the out vector 796cabdff1aSopenharmony_ci (2 signed word results) 797cabdff1aSopenharmony_ci*/ 798cabdff1aSopenharmony_ci#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 799cabdff1aSopenharmony_ci{ \ 800cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \ 801cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \ 802cabdff1aSopenharmony_ci} 803cabdff1aSopenharmony_ci#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 806cabdff1aSopenharmony_ci cnst0, cnst1, cnst2, cnst3, \ 807cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 808cabdff1aSopenharmony_ci{ \ 809cabdff1aSopenharmony_ci DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 810cabdff1aSopenharmony_ci DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 811cabdff1aSopenharmony_ci} 812cabdff1aSopenharmony_ci#define DOTP_SH4_SW(...) 
DOTP_SH4(v4i32, __VA_ARGS__) 813cabdff1aSopenharmony_ci 814cabdff1aSopenharmony_ci/* Description : Dot product & addition of byte vector elements 815cabdff1aSopenharmony_ci Arguments : Inputs - mult0, mult1 816cabdff1aSopenharmony_ci cnst0, cnst1 817cabdff1aSopenharmony_ci Outputs - out0, out1 818cabdff1aSopenharmony_ci Return Type - as per RTYPE 819cabdff1aSopenharmony_ci Details : Signed byte elements from mult0 are multiplied with 820cabdff1aSopenharmony_ci signed byte elements from cnst0 producing a result 821cabdff1aSopenharmony_ci twice the size of input i.e. signed halfword. 822cabdff1aSopenharmony_ci Then this multiplication results of adjacent odd-even elements 823cabdff1aSopenharmony_ci are added to the out vector 824cabdff1aSopenharmony_ci (2 signed halfword results) 825cabdff1aSopenharmony_ci*/ 826cabdff1aSopenharmony_ci#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 827cabdff1aSopenharmony_ci{ \ 828cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \ 829cabdff1aSopenharmony_ci (v16i8) mult0, (v16i8) cnst0); \ 830cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \ 831cabdff1aSopenharmony_ci (v16i8) mult1, (v16i8) cnst1); \ 832cabdff1aSopenharmony_ci} 833cabdff1aSopenharmony_ci#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 834cabdff1aSopenharmony_ci 835cabdff1aSopenharmony_ci#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ 836cabdff1aSopenharmony_ci cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 837cabdff1aSopenharmony_ci{ \ 838cabdff1aSopenharmony_ci DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 839cabdff1aSopenharmony_ci DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 840cabdff1aSopenharmony_ci} 841cabdff1aSopenharmony_ci#define DPADD_SB4_SH(...) 
DPADD_SB4(v8i16, __VA_ARGS__) 842cabdff1aSopenharmony_ci 843cabdff1aSopenharmony_ci/* Description : Dot product & addition of byte vector elements 844cabdff1aSopenharmony_ci Arguments : Inputs - mult0, mult1 845cabdff1aSopenharmony_ci cnst0, cnst1 846cabdff1aSopenharmony_ci Outputs - out0, out1 847cabdff1aSopenharmony_ci Return Type - as per RTYPE 848cabdff1aSopenharmony_ci Details : Unsigned byte elements from mult0 are multiplied with 849cabdff1aSopenharmony_ci unsigned byte elements from cnst0 producing a result 850cabdff1aSopenharmony_ci twice the size of input i.e. unsigned halfword. 851cabdff1aSopenharmony_ci Then this multiplication results of adjacent odd-even elements 852cabdff1aSopenharmony_ci are added to the out vector 853cabdff1aSopenharmony_ci (2 unsigned halfword results) 854cabdff1aSopenharmony_ci*/ 855cabdff1aSopenharmony_ci#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 856cabdff1aSopenharmony_ci{ \ 857cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0, \ 858cabdff1aSopenharmony_ci (v16u8) mult0, (v16u8) cnst0); \ 859cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1, \ 860cabdff1aSopenharmony_ci (v16u8) mult1, (v16u8) cnst1); \ 861cabdff1aSopenharmony_ci} 862cabdff1aSopenharmony_ci#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__) 863cabdff1aSopenharmony_ci 864cabdff1aSopenharmony_ci/* Description : Dot product & addition of halfword vector elements 865cabdff1aSopenharmony_ci Arguments : Inputs - mult0, mult1 866cabdff1aSopenharmony_ci cnst0, cnst1 867cabdff1aSopenharmony_ci Outputs - out0, out1 868cabdff1aSopenharmony_ci Return Type - as per RTYPE 869cabdff1aSopenharmony_ci Details : Signed halfword elements from mult0 are multiplied with 870cabdff1aSopenharmony_ci signed halfword elements from cnst0 producing a result 871cabdff1aSopenharmony_ci twice the size of input i.e. signed word. 
872cabdff1aSopenharmony_ci Then this multiplication results of adjacent odd-even elements 873cabdff1aSopenharmony_ci are added to the out vector 874cabdff1aSopenharmony_ci (2 signed word results) 875cabdff1aSopenharmony_ci*/ 876cabdff1aSopenharmony_ci#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 877cabdff1aSopenharmony_ci{ \ 878cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \ 879cabdff1aSopenharmony_ci (v8i16) mult0, (v8i16) cnst0); \ 880cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \ 881cabdff1aSopenharmony_ci (v8i16) mult1, (v8i16) cnst1); \ 882cabdff1aSopenharmony_ci} 883cabdff1aSopenharmony_ci#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) 884cabdff1aSopenharmony_ci 885cabdff1aSopenharmony_ci#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \ 886cabdff1aSopenharmony_ci cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ 887cabdff1aSopenharmony_ci{ \ 888cabdff1aSopenharmony_ci DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 889cabdff1aSopenharmony_ci DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 890cabdff1aSopenharmony_ci} 891cabdff1aSopenharmony_ci#define DPADD_SH4_SW(...) 
DPADD_SH4(v4i32, __VA_ARGS__) 892cabdff1aSopenharmony_ci 893cabdff1aSopenharmony_ci/* Description : Minimum values between unsigned elements of 894cabdff1aSopenharmony_ci either vector are copied to the output vector 895cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, min_vec 896cabdff1aSopenharmony_ci Outputs - in0, in1, (in place) 897cabdff1aSopenharmony_ci Return Type - as per RTYPE 898cabdff1aSopenharmony_ci Details : Minimum of unsigned halfword element values from 'in0' and 899cabdff1aSopenharmony_ci 'min_value' are written to output vector 'in0' 900cabdff1aSopenharmony_ci*/ 901cabdff1aSopenharmony_ci#define MIN_UH2(RTYPE, in0, in1, min_vec) \ 902cabdff1aSopenharmony_ci{ \ 903cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec); \ 904cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec); \ 905cabdff1aSopenharmony_ci} 906cabdff1aSopenharmony_ci#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) 907cabdff1aSopenharmony_ci 908cabdff1aSopenharmony_ci#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ 909cabdff1aSopenharmony_ci{ \ 910cabdff1aSopenharmony_ci MIN_UH2(RTYPE, in0, in1, min_vec); \ 911cabdff1aSopenharmony_ci MIN_UH2(RTYPE, in2, in3, min_vec); \ 912cabdff1aSopenharmony_ci} 913cabdff1aSopenharmony_ci#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) 914cabdff1aSopenharmony_ci 915cabdff1aSopenharmony_ci/* Description : Clips all halfword elements of input vector between min & max 916cabdff1aSopenharmony_ci out = ((in) < (min)) ? (min) : (((in) > (max)) ? 
(max) : (in)) 917cabdff1aSopenharmony_ci Arguments : Inputs - in (input vector) 918cabdff1aSopenharmony_ci - min (min threshold) 919cabdff1aSopenharmony_ci - max (max threshold) 920cabdff1aSopenharmony_ci Outputs - in (output vector with clipped elements) 921cabdff1aSopenharmony_ci Return Type - signed halfword 922cabdff1aSopenharmony_ci*/ 923cabdff1aSopenharmony_ci#define CLIP_SH(in, min, max) \ 924cabdff1aSopenharmony_ci{ \ 925cabdff1aSopenharmony_ci in = __msa_max_s_h((v8i16) min, (v8i16) in); \ 926cabdff1aSopenharmony_ci in = __msa_min_s_h((v8i16) max, (v8i16) in); \ 927cabdff1aSopenharmony_ci} 928cabdff1aSopenharmony_ci 929cabdff1aSopenharmony_ci/* Description : Clips all signed halfword elements of input vector 930cabdff1aSopenharmony_ci between 0 & 255 931cabdff1aSopenharmony_ci Arguments : Inputs - in (input vector) 932cabdff1aSopenharmony_ci Outputs - in (output vector with clipped elements) 933cabdff1aSopenharmony_ci Return Type - signed halfwords 934cabdff1aSopenharmony_ci*/ 935cabdff1aSopenharmony_ci#define CLIP_SH_0_255(in) \ 936cabdff1aSopenharmony_ci{ \ 937cabdff1aSopenharmony_ci in = __msa_maxi_s_h((v8i16) in, 0); \ 938cabdff1aSopenharmony_ci in = (v8i16) __msa_sat_u_h((v8u16) in, 7); \ 939cabdff1aSopenharmony_ci} 940cabdff1aSopenharmony_ci 941cabdff1aSopenharmony_ci#define CLIP_SH2_0_255(in0, in1) \ 942cabdff1aSopenharmony_ci{ \ 943cabdff1aSopenharmony_ci CLIP_SH_0_255(in0); \ 944cabdff1aSopenharmony_ci CLIP_SH_0_255(in1); \ 945cabdff1aSopenharmony_ci} 946cabdff1aSopenharmony_ci 947cabdff1aSopenharmony_ci#define CLIP_SH4_0_255(in0, in1, in2, in3) \ 948cabdff1aSopenharmony_ci{ \ 949cabdff1aSopenharmony_ci CLIP_SH2_0_255(in0, in1); \ 950cabdff1aSopenharmony_ci CLIP_SH2_0_255(in2, in3); \ 951cabdff1aSopenharmony_ci} 952cabdff1aSopenharmony_ci 953cabdff1aSopenharmony_ci#define CLIP_SH8_0_255(in0, in1, in2, in3, \ 954cabdff1aSopenharmony_ci in4, in5, in6, in7) \ 955cabdff1aSopenharmony_ci{ \ 956cabdff1aSopenharmony_ci CLIP_SH4_0_255(in0, in1, in2, in3); 
\ 957cabdff1aSopenharmony_ci CLIP_SH4_0_255(in4, in5, in6, in7); \ 958cabdff1aSopenharmony_ci} 959cabdff1aSopenharmony_ci 960cabdff1aSopenharmony_ci/* Description : Clips all signed word elements of input vector 961cabdff1aSopenharmony_ci between 0 & 255 962cabdff1aSopenharmony_ci Arguments : Inputs - in (input vector) 963cabdff1aSopenharmony_ci Outputs - in (output vector with clipped elements) 964cabdff1aSopenharmony_ci Return Type - signed word 965cabdff1aSopenharmony_ci*/ 966cabdff1aSopenharmony_ci#define CLIP_SW_0_255(in) \ 967cabdff1aSopenharmony_ci{ \ 968cabdff1aSopenharmony_ci in = __msa_maxi_s_w((v4i32) in, 0); \ 969cabdff1aSopenharmony_ci in = (v4i32) __msa_sat_u_w((v4u32) in, 7); \ 970cabdff1aSopenharmony_ci} 971cabdff1aSopenharmony_ci 972cabdff1aSopenharmony_ci#define CLIP_SW2_0_255(in0, in1) \ 973cabdff1aSopenharmony_ci{ \ 974cabdff1aSopenharmony_ci CLIP_SW_0_255(in0); \ 975cabdff1aSopenharmony_ci CLIP_SW_0_255(in1); \ 976cabdff1aSopenharmony_ci} 977cabdff1aSopenharmony_ci 978cabdff1aSopenharmony_ci#define CLIP_SW4_0_255(in0, in1, in2, in3) \ 979cabdff1aSopenharmony_ci{ \ 980cabdff1aSopenharmony_ci CLIP_SW2_0_255(in0, in1); \ 981cabdff1aSopenharmony_ci CLIP_SW2_0_255(in2, in3); \ 982cabdff1aSopenharmony_ci} 983cabdff1aSopenharmony_ci 984cabdff1aSopenharmony_ci#define CLIP_SW8_0_255(in0, in1, in2, in3, \ 985cabdff1aSopenharmony_ci in4, in5, in6, in7) \ 986cabdff1aSopenharmony_ci{ \ 987cabdff1aSopenharmony_ci CLIP_SW4_0_255(in0, in1, in2, in3); \ 988cabdff1aSopenharmony_ci CLIP_SW4_0_255(in4, in5, in6, in7); \ 989cabdff1aSopenharmony_ci} 990cabdff1aSopenharmony_ci 991cabdff1aSopenharmony_ci/* Description : Addition of 4 signed word elements 992cabdff1aSopenharmony_ci 4 signed word elements of input vector are added together and 993cabdff1aSopenharmony_ci resulted integer sum is returned 994cabdff1aSopenharmony_ci Arguments : Inputs - in (signed word vector) 995cabdff1aSopenharmony_ci Outputs - sum_m (i32 sum) 996cabdff1aSopenharmony_ci Return Type - 
signed word 997cabdff1aSopenharmony_ci*/ 998cabdff1aSopenharmony_ci#define HADD_SW_S32(in) \ 999cabdff1aSopenharmony_ci( { \ 1000cabdff1aSopenharmony_ci v2i64 res0_m, res1_m; \ 1001cabdff1aSopenharmony_ci int32_t sum_m; \ 1002cabdff1aSopenharmony_ci \ 1003cabdff1aSopenharmony_ci res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \ 1004cabdff1aSopenharmony_ci res1_m = __msa_splati_d(res0_m, 1); \ 1005cabdff1aSopenharmony_ci res0_m += res1_m; \ 1006cabdff1aSopenharmony_ci sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \ 1007cabdff1aSopenharmony_ci sum_m; \ 1008cabdff1aSopenharmony_ci} ) 1009cabdff1aSopenharmony_ci 1010cabdff1aSopenharmony_ci/* Description : Addition of 8 unsigned halfword elements 1011cabdff1aSopenharmony_ci 8 unsigned halfword elements of input vector are added 1012cabdff1aSopenharmony_ci together and resulted integer sum is returned 1013cabdff1aSopenharmony_ci Arguments : Inputs - in (unsigned halfword vector) 1014cabdff1aSopenharmony_ci Outputs - sum_m (u32 sum) 1015cabdff1aSopenharmony_ci Return Type - unsigned word 1016cabdff1aSopenharmony_ci*/ 1017cabdff1aSopenharmony_ci#define HADD_UH_U32(in) \ 1018cabdff1aSopenharmony_ci( { \ 1019cabdff1aSopenharmony_ci v4u32 res_m; \ 1020cabdff1aSopenharmony_ci v2u64 res0_m, res1_m; \ 1021cabdff1aSopenharmony_ci uint32_t sum_m; \ 1022cabdff1aSopenharmony_ci \ 1023cabdff1aSopenharmony_ci res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \ 1024cabdff1aSopenharmony_ci res0_m = __msa_hadd_u_d(res_m, res_m); \ 1025cabdff1aSopenharmony_ci res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \ 1026cabdff1aSopenharmony_ci res0_m += res1_m; \ 1027cabdff1aSopenharmony_ci sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \ 1028cabdff1aSopenharmony_ci sum_m; \ 1029cabdff1aSopenharmony_ci} ) 1030cabdff1aSopenharmony_ci 1031cabdff1aSopenharmony_ci/* Description : Horizontal addition of signed byte vector elements 1032cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 1033cabdff1aSopenharmony_ci Outputs - out0, out1 
1034cabdff1aSopenharmony_ci Return Type - as per RTYPE 1035cabdff1aSopenharmony_ci Details : Each signed odd byte element from 'in0' is added to 1036cabdff1aSopenharmony_ci even signed byte element from 'in0' (pairwise) and the 1037cabdff1aSopenharmony_ci halfword result is stored in 'out0' 1038cabdff1aSopenharmony_ci*/ 1039cabdff1aSopenharmony_ci#define HADD_SB2(RTYPE, in0, in1, out0, out1) \ 1040cabdff1aSopenharmony_ci{ \ 1041cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0); \ 1042cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1); \ 1043cabdff1aSopenharmony_ci} 1044cabdff1aSopenharmony_ci#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__) 1045cabdff1aSopenharmony_ci 1046cabdff1aSopenharmony_ci#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 1047cabdff1aSopenharmony_ci{ \ 1048cabdff1aSopenharmony_ci HADD_SB2(RTYPE, in0, in1, out0, out1); \ 1049cabdff1aSopenharmony_ci HADD_SB2(RTYPE, in2, in3, out2, out3); \ 1050cabdff1aSopenharmony_ci} 1051cabdff1aSopenharmony_ci#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__) 1052cabdff1aSopenharmony_ci#define HADD_SB4_SH(...) 
HADD_SB4(v8i16, __VA_ARGS__) 1053cabdff1aSopenharmony_ci 1054cabdff1aSopenharmony_ci/* Description : Horizontal addition of unsigned byte vector elements 1055cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 1056cabdff1aSopenharmony_ci Outputs - out0, out1 1057cabdff1aSopenharmony_ci Return Type - as per RTYPE 1058cabdff1aSopenharmony_ci Details : Each unsigned odd byte element from 'in0' is added to 1059cabdff1aSopenharmony_ci even unsigned byte element from 'in0' (pairwise) and the 1060cabdff1aSopenharmony_ci halfword result is stored in 'out0' 1061cabdff1aSopenharmony_ci*/ 1062cabdff1aSopenharmony_ci#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ 1063cabdff1aSopenharmony_ci{ \ 1064cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0); \ 1065cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1); \ 1066cabdff1aSopenharmony_ci} 1067cabdff1aSopenharmony_ci#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) 1068cabdff1aSopenharmony_ci 1069cabdff1aSopenharmony_ci#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2) \ 1070cabdff1aSopenharmony_ci{ \ 1071cabdff1aSopenharmony_ci HADD_UB2(RTYPE, in0, in1, out0, out1); \ 1072cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2); \ 1073cabdff1aSopenharmony_ci} 1074cabdff1aSopenharmony_ci#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__) 1075cabdff1aSopenharmony_ci 1076cabdff1aSopenharmony_ci#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 1077cabdff1aSopenharmony_ci{ \ 1078cabdff1aSopenharmony_ci HADD_UB2(RTYPE, in0, in1, out0, out1); \ 1079cabdff1aSopenharmony_ci HADD_UB2(RTYPE, in2, in3, out2, out3); \ 1080cabdff1aSopenharmony_ci} 1081cabdff1aSopenharmony_ci#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__) 1082cabdff1aSopenharmony_ci#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) 1083cabdff1aSopenharmony_ci#define HADD_UB4_SH(...) 
HADD_UB4(v8i16, __VA_ARGS__) 1084cabdff1aSopenharmony_ci 1085cabdff1aSopenharmony_ci/* Description : Horizontal subtraction of unsigned byte vector elements 1086cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 1087cabdff1aSopenharmony_ci Outputs - out0, out1 1088cabdff1aSopenharmony_ci Return Type - as per RTYPE 1089cabdff1aSopenharmony_ci Details : Each unsigned odd byte element from 'in0' is subtracted from 1090cabdff1aSopenharmony_ci even unsigned byte element from 'in0' (pairwise) and the 1091cabdff1aSopenharmony_ci halfword result is stored in 'out0' 1092cabdff1aSopenharmony_ci*/ 1093cabdff1aSopenharmony_ci#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ 1094cabdff1aSopenharmony_ci{ \ 1095cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \ 1096cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \ 1097cabdff1aSopenharmony_ci} 1098cabdff1aSopenharmony_ci#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__) 1099cabdff1aSopenharmony_ci#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) 1100cabdff1aSopenharmony_ci 1101cabdff1aSopenharmony_ci#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ 1102cabdff1aSopenharmony_ci{ \ 1103cabdff1aSopenharmony_ci HSUB_UB2(RTYPE, in0, in1, out0, out1); \ 1104cabdff1aSopenharmony_ci HSUB_UB2(RTYPE, in2, in3, out2, out3); \ 1105cabdff1aSopenharmony_ci} 1106cabdff1aSopenharmony_ci#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__) 1107cabdff1aSopenharmony_ci#define HSUB_UB4_SH(...) 
HSUB_UB4(v8i16, __VA_ARGS__) 1108cabdff1aSopenharmony_ci 1109cabdff1aSopenharmony_ci/* Description : SAD (Sum of Absolute Difference) 1110cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, ref0, ref1 (unsigned byte src & ref) 1111cabdff1aSopenharmony_ci Outputs - sad_m (halfword vector with sad) 1112cabdff1aSopenharmony_ci Return Type - unsigned halfword 1113cabdff1aSopenharmony_ci Details : Absolute difference of all the byte elements from 'in0' with 1114cabdff1aSopenharmony_ci 'ref0' is calculated and preserved in 'diff0'. From the 16 1115cabdff1aSopenharmony_ci unsigned absolute diff values, even-odd pairs are added 1116cabdff1aSopenharmony_ci together to generate 8 halfword results. 1117cabdff1aSopenharmony_ci*/ 1118cabdff1aSopenharmony_ci#define SAD_UB2_UH(in0, in1, ref0, ref1) \ 1119cabdff1aSopenharmony_ci( { \ 1120cabdff1aSopenharmony_ci v16u8 diff0_m, diff1_m; \ 1121cabdff1aSopenharmony_ci v8u16 sad_m = { 0 }; \ 1122cabdff1aSopenharmony_ci \ 1123cabdff1aSopenharmony_ci diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \ 1124cabdff1aSopenharmony_ci diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \ 1125cabdff1aSopenharmony_ci \ 1126cabdff1aSopenharmony_ci sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \ 1127cabdff1aSopenharmony_ci sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \ 1128cabdff1aSopenharmony_ci \ 1129cabdff1aSopenharmony_ci sad_m; \ 1130cabdff1aSopenharmony_ci} ) 1131cabdff1aSopenharmony_ci 1132cabdff1aSopenharmony_ci/* Description : Insert specified word elements from input vectors to 1 1133cabdff1aSopenharmony_ci destination vector 1134cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) 1135cabdff1aSopenharmony_ci Outputs - out (output vector) 1136cabdff1aSopenharmony_ci Return Type - as per RTYPE 1137cabdff1aSopenharmony_ci*/ 1138cabdff1aSopenharmony_ci#define INSERT_W2(RTYPE, in0, in1, out) \ 1139cabdff1aSopenharmony_ci{ \ 1140cabdff1aSopenharmony_ci out = (RTYPE) 
__msa_insert_w((v4i32) out, 0, in0); \ 1141cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \ 1142cabdff1aSopenharmony_ci} 1143cabdff1aSopenharmony_ci#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__) 1144cabdff1aSopenharmony_ci#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) 1145cabdff1aSopenharmony_ci 1146cabdff1aSopenharmony_ci#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ 1147cabdff1aSopenharmony_ci{ \ 1148cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \ 1149cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \ 1150cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2); \ 1151cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3); \ 1152cabdff1aSopenharmony_ci} 1153cabdff1aSopenharmony_ci#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) 1154cabdff1aSopenharmony_ci#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) 1155cabdff1aSopenharmony_ci#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__) 1156cabdff1aSopenharmony_ci#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__) 1157cabdff1aSopenharmony_ci 1158cabdff1aSopenharmony_ci/* Description : Insert specified double word elements from input vectors to 1 1159cabdff1aSopenharmony_ci destination vector 1160cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 (2 input vectors) 1161cabdff1aSopenharmony_ci Outputs - out (output vector) 1162cabdff1aSopenharmony_ci Return Type - as per RTYPE 1163cabdff1aSopenharmony_ci*/ 1164cabdff1aSopenharmony_ci#define INSERT_D2(RTYPE, in0, in1, out) \ 1165cabdff1aSopenharmony_ci{ \ 1166cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \ 1167cabdff1aSopenharmony_ci out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \ 1168cabdff1aSopenharmony_ci} 1169cabdff1aSopenharmony_ci#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) 1170cabdff1aSopenharmony_ci#define INSERT_D2_SB(...) 
INSERT_D2(v16i8, __VA_ARGS__) 1171cabdff1aSopenharmony_ci#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) 1172cabdff1aSopenharmony_ci#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__) 1173cabdff1aSopenharmony_ci 1174cabdff1aSopenharmony_ci/* Description : Interleave even byte elements from vectors 1175cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1176cabdff1aSopenharmony_ci Outputs - out0, out1 1177cabdff1aSopenharmony_ci Return Type - as per RTYPE 1178cabdff1aSopenharmony_ci Details : Even byte elements of 'in0' and even byte 1179cabdff1aSopenharmony_ci elements of 'in1' are interleaved and copied to 'out0' 1180cabdff1aSopenharmony_ci Even byte elements of 'in2' and even byte 1181cabdff1aSopenharmony_ci elements of 'in3' are interleaved and copied to 'out1' 1182cabdff1aSopenharmony_ci*/ 1183cabdff1aSopenharmony_ci#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1184cabdff1aSopenharmony_ci{ \ 1185cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \ 1186cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \ 1187cabdff1aSopenharmony_ci} 1188cabdff1aSopenharmony_ci#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) 1189cabdff1aSopenharmony_ci#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__) 1190cabdff1aSopenharmony_ci#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) 1191cabdff1aSopenharmony_ci#define ILVEV_B2_SD(...) 
ILVEV_B2(v2i64, __VA_ARGS__) 1192cabdff1aSopenharmony_ci 1193cabdff1aSopenharmony_ci/* Description : Interleave even halfword elements from vectors 1194cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1195cabdff1aSopenharmony_ci Outputs - out0, out1 1196cabdff1aSopenharmony_ci Return Type - as per RTYPE 1197cabdff1aSopenharmony_ci Details : Even halfword elements of 'in0' and even halfword 1198cabdff1aSopenharmony_ci elements of 'in1' are interleaved and copied to 'out0' 1199cabdff1aSopenharmony_ci Even halfword elements of 'in2' and even halfword 1200cabdff1aSopenharmony_ci elements of 'in3' are interleaved and copied to 'out1' 1201cabdff1aSopenharmony_ci*/ 1202cabdff1aSopenharmony_ci#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1203cabdff1aSopenharmony_ci{ \ 1204cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \ 1205cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \ 1206cabdff1aSopenharmony_ci} 1207cabdff1aSopenharmony_ci#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) 1208cabdff1aSopenharmony_ci#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) 1209cabdff1aSopenharmony_ci#define ILVEV_H2_SW(...) 
ILVEV_H2(v4i32, __VA_ARGS__) 1210cabdff1aSopenharmony_ci 1211cabdff1aSopenharmony_ci/* Description : Interleave even word elements from vectors 1212cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1213cabdff1aSopenharmony_ci Outputs - out0, out1 1214cabdff1aSopenharmony_ci Return Type - as per RTYPE 1215cabdff1aSopenharmony_ci Details : Even word elements of 'in0' and even word 1216cabdff1aSopenharmony_ci elements of 'in1' are interleaved and copied to 'out0' 1217cabdff1aSopenharmony_ci Even word elements of 'in2' and even word 1218cabdff1aSopenharmony_ci elements of 'in3' are interleaved and copied to 'out1' 1219cabdff1aSopenharmony_ci*/ 1220cabdff1aSopenharmony_ci#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1221cabdff1aSopenharmony_ci{ \ 1222cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \ 1223cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \ 1224cabdff1aSopenharmony_ci} 1225cabdff1aSopenharmony_ci#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__) 1226cabdff1aSopenharmony_ci#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) 1227cabdff1aSopenharmony_ci#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__) 1228cabdff1aSopenharmony_ci#define ILVEV_W2_SD(...) 
ILVEV_W2(v2i64, __VA_ARGS__) 1229cabdff1aSopenharmony_ci 1230cabdff1aSopenharmony_ci/* Description : Interleave even double word elements from vectors 1231cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1232cabdff1aSopenharmony_ci Outputs - out0, out1 1233cabdff1aSopenharmony_ci Return Type - as per RTYPE 1234cabdff1aSopenharmony_ci Details : Even double word elements of 'in0' and even double word 1235cabdff1aSopenharmony_ci elements of 'in1' are interleaved and copied to 'out0' 1236cabdff1aSopenharmony_ci Even double word elements of 'in2' and even double word 1237cabdff1aSopenharmony_ci elements of 'in3' are interleaved and copied to 'out1' 1238cabdff1aSopenharmony_ci*/ 1239cabdff1aSopenharmony_ci#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1240cabdff1aSopenharmony_ci{ \ 1241cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \ 1242cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \ 1243cabdff1aSopenharmony_ci} 1244cabdff1aSopenharmony_ci#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) 1245cabdff1aSopenharmony_ci#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__) 1246cabdff1aSopenharmony_ci#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__) 1247cabdff1aSopenharmony_ci 1248cabdff1aSopenharmony_ci/* Description : Interleave left half of byte elements from vectors 1249cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1250cabdff1aSopenharmony_ci Outputs - out0, out1 1251cabdff1aSopenharmony_ci Return Type - as per RTYPE 1252cabdff1aSopenharmony_ci Details : Left half of byte elements of in0 and left half of byte 1253cabdff1aSopenharmony_ci elements of in1 are interleaved and copied to out0. 1254cabdff1aSopenharmony_ci Left half of byte elements of in2 and left half of byte 1255cabdff1aSopenharmony_ci elements of in3 are interleaved and copied to out1. 
1256cabdff1aSopenharmony_ci*/ 1257cabdff1aSopenharmony_ci#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1258cabdff1aSopenharmony_ci{ \ 1259cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ 1260cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \ 1261cabdff1aSopenharmony_ci} 1262cabdff1aSopenharmony_ci#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) 1263cabdff1aSopenharmony_ci#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) 1264cabdff1aSopenharmony_ci#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) 1265cabdff1aSopenharmony_ci#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) 1266cabdff1aSopenharmony_ci 1267cabdff1aSopenharmony_ci#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1268cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1269cabdff1aSopenharmony_ci{ \ 1270cabdff1aSopenharmony_ci ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1271cabdff1aSopenharmony_ci ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1272cabdff1aSopenharmony_ci} 1273cabdff1aSopenharmony_ci#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__) 1274cabdff1aSopenharmony_ci#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) 1275cabdff1aSopenharmony_ci#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) 1276cabdff1aSopenharmony_ci#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) 1277cabdff1aSopenharmony_ci 1278cabdff1aSopenharmony_ci/* Description : Interleave left half of halfword elements from vectors 1279cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1280cabdff1aSopenharmony_ci Outputs - out0, out1 1281cabdff1aSopenharmony_ci Return Type - as per RTYPE 1282cabdff1aSopenharmony_ci Details : Left half of halfword elements of in0 and left half of halfword 1283cabdff1aSopenharmony_ci elements of in1 are interleaved and copied to out0. 
1284cabdff1aSopenharmony_ci Left half of halfword elements of in2 and left half of halfword 1285cabdff1aSopenharmony_ci elements of in3 are interleaved and copied to out1. 1286cabdff1aSopenharmony_ci*/ 1287cabdff1aSopenharmony_ci#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1288cabdff1aSopenharmony_ci{ \ 1289cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \ 1290cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \ 1291cabdff1aSopenharmony_ci} 1292cabdff1aSopenharmony_ci#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) 1293cabdff1aSopenharmony_ci#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) 1294cabdff1aSopenharmony_ci 1295cabdff1aSopenharmony_ci#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1296cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1297cabdff1aSopenharmony_ci{ \ 1298cabdff1aSopenharmony_ci ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1299cabdff1aSopenharmony_ci ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1300cabdff1aSopenharmony_ci} 1301cabdff1aSopenharmony_ci#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__) 1302cabdff1aSopenharmony_ci#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__) 1303cabdff1aSopenharmony_ci 1304cabdff1aSopenharmony_ci/* Description : Interleave left half of word elements from vectors 1305cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1306cabdff1aSopenharmony_ci Outputs - out0, out1 1307cabdff1aSopenharmony_ci Return Type - as per RTYPE 1308cabdff1aSopenharmony_ci Details : Left half of word elements of in0 and left half of word 1309cabdff1aSopenharmony_ci elements of in1 are interleaved and copied to out0. 1310cabdff1aSopenharmony_ci Left half of word elements of in2 and left half of word 1311cabdff1aSopenharmony_ci elements of in3 are interleaved and copied to out1. 
1312cabdff1aSopenharmony_ci*/ 1313cabdff1aSopenharmony_ci#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1314cabdff1aSopenharmony_ci{ \ 1315cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ 1316cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \ 1317cabdff1aSopenharmony_ci} 1318cabdff1aSopenharmony_ci#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) 1319cabdff1aSopenharmony_ci#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__) 1320cabdff1aSopenharmony_ci#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) 1321cabdff1aSopenharmony_ci 1322cabdff1aSopenharmony_ci/* Description : Interleave right half of byte elements from vectors 1323cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1324cabdff1aSopenharmony_ci Outputs - out0, out1, out2, out3 1325cabdff1aSopenharmony_ci Return Type - as per RTYPE 1326cabdff1aSopenharmony_ci Details : Right half of byte elements of in0 and right half of byte 1327cabdff1aSopenharmony_ci elements of in1 are interleaved and copied to out0. 1328cabdff1aSopenharmony_ci Right half of byte elements of in2 and right half of byte 1329cabdff1aSopenharmony_ci elements of in3 are interleaved and copied to out1. 1330cabdff1aSopenharmony_ci Similar for other pairs 1331cabdff1aSopenharmony_ci*/ 1332cabdff1aSopenharmony_ci#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1333cabdff1aSopenharmony_ci{ \ 1334cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ 1335cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \ 1336cabdff1aSopenharmony_ci} 1337cabdff1aSopenharmony_ci#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) 1338cabdff1aSopenharmony_ci#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) 1339cabdff1aSopenharmony_ci#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) 1340cabdff1aSopenharmony_ci#define ILVR_B2_SH(...) 
ILVR_B2(v8i16, __VA_ARGS__) 1341cabdff1aSopenharmony_ci#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) 1342cabdff1aSopenharmony_ci 1343cabdff1aSopenharmony_ci#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ 1344cabdff1aSopenharmony_ci{ \ 1345cabdff1aSopenharmony_ci ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1346cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \ 1347cabdff1aSopenharmony_ci} 1348cabdff1aSopenharmony_ci#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__) 1349cabdff1aSopenharmony_ci#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__) 1350cabdff1aSopenharmony_ci#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__) 1351cabdff1aSopenharmony_ci#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__) 1352cabdff1aSopenharmony_ci 1353cabdff1aSopenharmony_ci#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1354cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1355cabdff1aSopenharmony_ci{ \ 1356cabdff1aSopenharmony_ci ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1357cabdff1aSopenharmony_ci ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1358cabdff1aSopenharmony_ci} 1359cabdff1aSopenharmony_ci#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) 1360cabdff1aSopenharmony_ci#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) 1361cabdff1aSopenharmony_ci#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) 1362cabdff1aSopenharmony_ci#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) 1363cabdff1aSopenharmony_ci#define ILVR_B4_SW(...) 
ILVR_B4(v4i32, __VA_ARGS__) 1364cabdff1aSopenharmony_ci 1365cabdff1aSopenharmony_ci#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1366cabdff1aSopenharmony_ci in8, in9, in10, in11, in12, in13, in14, in15, \ 1367cabdff1aSopenharmony_ci out0, out1, out2, out3, out4, out5, out6, out7) \ 1368cabdff1aSopenharmony_ci{ \ 1369cabdff1aSopenharmony_ci ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1370cabdff1aSopenharmony_ci out0, out1, out2, out3); \ 1371cabdff1aSopenharmony_ci ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ 1372cabdff1aSopenharmony_ci out4, out5, out6, out7); \ 1373cabdff1aSopenharmony_ci} 1374cabdff1aSopenharmony_ci#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) 1375cabdff1aSopenharmony_ci#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__) 1376cabdff1aSopenharmony_ci 1377cabdff1aSopenharmony_ci/* Description : Interleave right half of halfword elements from vectors 1378cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1379cabdff1aSopenharmony_ci Outputs - out0, out1, out2, out3 1380cabdff1aSopenharmony_ci Return Type - as per RTYPE 1381cabdff1aSopenharmony_ci Details : Right half of halfword elements of in0 and right half of 1382cabdff1aSopenharmony_ci halfword elements of in1 are interleaved and copied to out0. 1383cabdff1aSopenharmony_ci Right half of halfword elements of in2 and right half of 1384cabdff1aSopenharmony_ci halfword elements of in3 are interleaved and copied to out1. 1385cabdff1aSopenharmony_ci Similar for other pairs 1386cabdff1aSopenharmony_ci*/ 1387cabdff1aSopenharmony_ci#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1388cabdff1aSopenharmony_ci{ \ 1389cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \ 1390cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \ 1391cabdff1aSopenharmony_ci} 1392cabdff1aSopenharmony_ci#define ILVR_H2_SH(...) 
ILVR_H2(v8i16, __VA_ARGS__) 1393cabdff1aSopenharmony_ci#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) 1394cabdff1aSopenharmony_ci 1395cabdff1aSopenharmony_ci#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ 1396cabdff1aSopenharmony_ci{ \ 1397cabdff1aSopenharmony_ci ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1398cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \ 1399cabdff1aSopenharmony_ci} 1400cabdff1aSopenharmony_ci#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__) 1401cabdff1aSopenharmony_ci 1402cabdff1aSopenharmony_ci#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1403cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1404cabdff1aSopenharmony_ci{ \ 1405cabdff1aSopenharmony_ci ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1406cabdff1aSopenharmony_ci ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1407cabdff1aSopenharmony_ci} 1408cabdff1aSopenharmony_ci#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) 1409cabdff1aSopenharmony_ci#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) 1410cabdff1aSopenharmony_ci 1411cabdff1aSopenharmony_ci#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1412cabdff1aSopenharmony_ci{ \ 1413cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ 1414cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \ 1415cabdff1aSopenharmony_ci} 1416cabdff1aSopenharmony_ci#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) 1417cabdff1aSopenharmony_ci#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__) 1418cabdff1aSopenharmony_ci#define ILVR_W2_SH(...) 
ILVR_W2(v8i16, __VA_ARGS__) 1419cabdff1aSopenharmony_ci 1420cabdff1aSopenharmony_ci#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1421cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1422cabdff1aSopenharmony_ci{ \ 1423cabdff1aSopenharmony_ci ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1424cabdff1aSopenharmony_ci ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1425cabdff1aSopenharmony_ci} 1426cabdff1aSopenharmony_ci#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__) 1427cabdff1aSopenharmony_ci#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) 1428cabdff1aSopenharmony_ci 1429cabdff1aSopenharmony_ci/* Description : Interleave right half of double word elements from vectors 1430cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1431cabdff1aSopenharmony_ci Outputs - out0, out1, out2, out3 1432cabdff1aSopenharmony_ci Return Type - as per RTYPE 1433cabdff1aSopenharmony_ci Details : Right half of double word elements of in0 and right half of 1434cabdff1aSopenharmony_ci double word elements of in1 are interleaved and copied to out0. 1435cabdff1aSopenharmony_ci Right half of double word elements of in2 and right half of 1436cabdff1aSopenharmony_ci double word elements of in3 are interleaved and copied to out1. 1437cabdff1aSopenharmony_ci*/ 1438cabdff1aSopenharmony_ci#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1439cabdff1aSopenharmony_ci{ \ 1440cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ 1441cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \ 1442cabdff1aSopenharmony_ci} 1443cabdff1aSopenharmony_ci#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) 1444cabdff1aSopenharmony_ci#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) 1445cabdff1aSopenharmony_ci#define ILVR_D2_SH(...) 
ILVR_D2(v8i16, __VA_ARGS__) 1446cabdff1aSopenharmony_ci 1447cabdff1aSopenharmony_ci#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ 1448cabdff1aSopenharmony_ci{ \ 1449cabdff1aSopenharmony_ci ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1450cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \ 1451cabdff1aSopenharmony_ci} 1452cabdff1aSopenharmony_ci#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) 1453cabdff1aSopenharmony_ci 1454cabdff1aSopenharmony_ci#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1455cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1456cabdff1aSopenharmony_ci{ \ 1457cabdff1aSopenharmony_ci ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1458cabdff1aSopenharmony_ci ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1459cabdff1aSopenharmony_ci} 1460cabdff1aSopenharmony_ci#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) 1461cabdff1aSopenharmony_ci#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) 1462cabdff1aSopenharmony_ci 1463cabdff1aSopenharmony_ci/* Description : Interleave left half of double word elements from vectors 1464cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1465cabdff1aSopenharmony_ci Outputs - out0, out1 1466cabdff1aSopenharmony_ci Return Type - as per RTYPE 1467cabdff1aSopenharmony_ci Details : Left half of double word elements of in0 and left half of 1468cabdff1aSopenharmony_ci double word elements of in1 are interleaved and copied to out0. 1469cabdff1aSopenharmony_ci Left half of double word elements of in2 and left half of 1470cabdff1aSopenharmony_ci double word elements of in3 are interleaved and copied to out1. 
1471cabdff1aSopenharmony_ci*/ 1472cabdff1aSopenharmony_ci#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1473cabdff1aSopenharmony_ci{ \ 1474cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ 1475cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \ 1476cabdff1aSopenharmony_ci} 1477cabdff1aSopenharmony_ci#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__) 1478cabdff1aSopenharmony_ci#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__) 1479cabdff1aSopenharmony_ci#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__) 1480cabdff1aSopenharmony_ci 1481cabdff1aSopenharmony_ci/* Description : Interleave both left and right half of input vectors 1482cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 1483cabdff1aSopenharmony_ci Outputs - out0, out1 1484cabdff1aSopenharmony_ci Return Type - as per RTYPE 1485cabdff1aSopenharmony_ci Details : Right half of byte elements from 'in0' and 'in1' are 1486cabdff1aSopenharmony_ci interleaved and stored to 'out0' 1487cabdff1aSopenharmony_ci Left half of byte elements from 'in0' and 'in1' are 1488cabdff1aSopenharmony_ci interleaved and stored to 'out1' 1489cabdff1aSopenharmony_ci*/ 1490cabdff1aSopenharmony_ci#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ 1491cabdff1aSopenharmony_ci{ \ 1492cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ 1493cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ 1494cabdff1aSopenharmony_ci} 1495cabdff1aSopenharmony_ci#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) 1496cabdff1aSopenharmony_ci#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) 1497cabdff1aSopenharmony_ci#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) 1498cabdff1aSopenharmony_ci#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) 1499cabdff1aSopenharmony_ci#define ILVRL_B2_SW(...) 
ILVRL_B2(v4i32, __VA_ARGS__) 1500cabdff1aSopenharmony_ci 1501cabdff1aSopenharmony_ci#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ 1502cabdff1aSopenharmony_ci{ \ 1503cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \ 1504cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \ 1505cabdff1aSopenharmony_ci} 1506cabdff1aSopenharmony_ci#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) 1507cabdff1aSopenharmony_ci#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__) 1508cabdff1aSopenharmony_ci#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) 1509cabdff1aSopenharmony_ci#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) 1510cabdff1aSopenharmony_ci 1511cabdff1aSopenharmony_ci#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ 1512cabdff1aSopenharmony_ci{ \ 1513cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ 1514cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ 1515cabdff1aSopenharmony_ci} 1516cabdff1aSopenharmony_ci#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) 1517cabdff1aSopenharmony_ci#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) 1518cabdff1aSopenharmony_ci#define ILVRL_W2_SW(...) 
ILVRL_W2(v4i32, __VA_ARGS__) 1519cabdff1aSopenharmony_ci 1520cabdff1aSopenharmony_ci/* Description : Maximum values between signed elements of vector and 1521cabdff1aSopenharmony_ci 5-bit signed immediate value are copied to the output vector 1522cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, max_val 1523cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1524cabdff1aSopenharmony_ci Return Type - as per RTYPE 1525cabdff1aSopenharmony_ci Details : Maximum of signed halfword element values from 'in0' and 1526cabdff1aSopenharmony_ci 'max_val' are written to output vector 'in0' 1527cabdff1aSopenharmony_ci*/ 1528cabdff1aSopenharmony_ci#define MAXI_SH2(RTYPE, in0, in1, max_val) \ 1529cabdff1aSopenharmony_ci{ \ 1530cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \ 1531cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \ 1532cabdff1aSopenharmony_ci} 1533cabdff1aSopenharmony_ci#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__) 1534cabdff1aSopenharmony_ci#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) 1535cabdff1aSopenharmony_ci 1536cabdff1aSopenharmony_ci#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \ 1537cabdff1aSopenharmony_ci{ \ 1538cabdff1aSopenharmony_ci MAXI_SH2(RTYPE, in0, in1, max_val); \ 1539cabdff1aSopenharmony_ci MAXI_SH2(RTYPE, in2, in3, max_val); \ 1540cabdff1aSopenharmony_ci} 1541cabdff1aSopenharmony_ci#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__) 1542cabdff1aSopenharmony_ci#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__) 1543cabdff1aSopenharmony_ci 1544cabdff1aSopenharmony_ci#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \ 1545cabdff1aSopenharmony_ci{ \ 1546cabdff1aSopenharmony_ci MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \ 1547cabdff1aSopenharmony_ci MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \ 1548cabdff1aSopenharmony_ci} 1549cabdff1aSopenharmony_ci#define MAXI_SH8_UH(...) 
MAXI_SH8(v8u16, __VA_ARGS__) 1550cabdff1aSopenharmony_ci#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__) 1551cabdff1aSopenharmony_ci 1552cabdff1aSopenharmony_ci/* Description : Saturate the halfword element values to the max 1553cabdff1aSopenharmony_ci unsigned value of (sat_val+1 bits) 1554cabdff1aSopenharmony_ci The element data width remains unchanged 1555cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, sat_val 1556cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1557cabdff1aSopenharmony_ci Return Type - as per RTYPE 1558cabdff1aSopenharmony_ci Details : Each unsigned halfword element from 'in0' is saturated to the 1559cabdff1aSopenharmony_ci value generated with (sat_val+1) bit range 1560cabdff1aSopenharmony_ci Results are in placed to original vectors 1561cabdff1aSopenharmony_ci*/ 1562cabdff1aSopenharmony_ci#define SAT_UH2(RTYPE, in0, in1, sat_val) \ 1563cabdff1aSopenharmony_ci{ \ 1564cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \ 1565cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \ 1566cabdff1aSopenharmony_ci} 1567cabdff1aSopenharmony_ci#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) 1568cabdff1aSopenharmony_ci#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__) 1569cabdff1aSopenharmony_ci 1570cabdff1aSopenharmony_ci#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1571cabdff1aSopenharmony_ci{ \ 1572cabdff1aSopenharmony_ci SAT_UH2(RTYPE, in0, in1, sat_val); \ 1573cabdff1aSopenharmony_ci SAT_UH2(RTYPE, in2, in3, sat_val); \ 1574cabdff1aSopenharmony_ci} 1575cabdff1aSopenharmony_ci#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) 1576cabdff1aSopenharmony_ci#define SAT_UH4_SH(...) 
SAT_UH4(v8i16, __VA_ARGS__) 1577cabdff1aSopenharmony_ci 1578cabdff1aSopenharmony_ci#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \ 1579cabdff1aSopenharmony_ci{ \ 1580cabdff1aSopenharmony_ci SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \ 1581cabdff1aSopenharmony_ci SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \ 1582cabdff1aSopenharmony_ci} 1583cabdff1aSopenharmony_ci#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__) 1584cabdff1aSopenharmony_ci#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__) 1585cabdff1aSopenharmony_ci 1586cabdff1aSopenharmony_ci/* Description : Saturate the halfword element values to the max 1587cabdff1aSopenharmony_ci unsigned value of (sat_val+1 bits) 1588cabdff1aSopenharmony_ci The element data width remains unchanged 1589cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, sat_val 1590cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1591cabdff1aSopenharmony_ci Return Type - as per RTYPE 1592cabdff1aSopenharmony_ci Details : Each unsigned halfword element from 'in0' is saturated to the 1593cabdff1aSopenharmony_ci value generated with (sat_val+1) bit range 1594cabdff1aSopenharmony_ci Results are in placed to original vectors 1595cabdff1aSopenharmony_ci*/ 1596cabdff1aSopenharmony_ci#define SAT_SH2(RTYPE, in0, in1, sat_val) \ 1597cabdff1aSopenharmony_ci{ \ 1598cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \ 1599cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \ 1600cabdff1aSopenharmony_ci} 1601cabdff1aSopenharmony_ci#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) 1602cabdff1aSopenharmony_ci 1603cabdff1aSopenharmony_ci#define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \ 1604cabdff1aSopenharmony_ci{ \ 1605cabdff1aSopenharmony_ci SAT_SH2(RTYPE, in0, in1, sat_val); \ 1606cabdff1aSopenharmony_ci in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \ 1607cabdff1aSopenharmony_ci} 1608cabdff1aSopenharmony_ci#define SAT_SH3_SH(...) 
SAT_SH3(v8i16, __VA_ARGS__) 1609cabdff1aSopenharmony_ci 1610cabdff1aSopenharmony_ci#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ 1611cabdff1aSopenharmony_ci{ \ 1612cabdff1aSopenharmony_ci SAT_SH2(RTYPE, in0, in1, sat_val); \ 1613cabdff1aSopenharmony_ci SAT_SH2(RTYPE, in2, in3, sat_val); \ 1614cabdff1aSopenharmony_ci} 1615cabdff1aSopenharmony_ci#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) 1616cabdff1aSopenharmony_ci 1617cabdff1aSopenharmony_ci/* Description : Saturate the word element values to the max 1618cabdff1aSopenharmony_ci unsigned value of (sat_val+1 bits) 1619cabdff1aSopenharmony_ci The element data width remains unchanged 1620cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, sat_val 1621cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1622cabdff1aSopenharmony_ci Return Type - as per RTYPE 1623cabdff1aSopenharmony_ci Details : Each unsigned word element from 'in0' is saturated to the 1624cabdff1aSopenharmony_ci value generated with (sat_val+1) bit range 1625cabdff1aSopenharmony_ci Results are in placed to original vectors 1626cabdff1aSopenharmony_ci*/ 1627cabdff1aSopenharmony_ci#define SAT_SW2(RTYPE, in0, in1, sat_val) \ 1628cabdff1aSopenharmony_ci{ \ 1629cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \ 1630cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \ 1631cabdff1aSopenharmony_ci} 1632cabdff1aSopenharmony_ci#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__) 1633cabdff1aSopenharmony_ci 1634cabdff1aSopenharmony_ci#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \ 1635cabdff1aSopenharmony_ci{ \ 1636cabdff1aSopenharmony_ci SAT_SW2(RTYPE, in0, in1, sat_val); \ 1637cabdff1aSopenharmony_ci SAT_SW2(RTYPE, in2, in3, sat_val); \ 1638cabdff1aSopenharmony_ci} 1639cabdff1aSopenharmony_ci#define SAT_SW4_SW(...) 
SAT_SW4(v4i32, __VA_ARGS__) 1640cabdff1aSopenharmony_ci 1641cabdff1aSopenharmony_ci/* Description : Indexed halfword element values are replicated to all 1642cabdff1aSopenharmony_ci elements in output vector 1643cabdff1aSopenharmony_ci Arguments : Inputs - in, idx0, idx1 1644cabdff1aSopenharmony_ci Outputs - out0, out1 1645cabdff1aSopenharmony_ci Return Type - as per RTYPE 1646cabdff1aSopenharmony_ci Details : 'idx0' element value from 'in' vector is replicated to all 1647cabdff1aSopenharmony_ci elements in 'out0' vector 1648cabdff1aSopenharmony_ci Valid index range for halfword operation is 0-7 1649cabdff1aSopenharmony_ci*/ 1650cabdff1aSopenharmony_ci#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ 1651cabdff1aSopenharmony_ci{ \ 1652cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \ 1653cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \ 1654cabdff1aSopenharmony_ci} 1655cabdff1aSopenharmony_ci#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__) 1656cabdff1aSopenharmony_ci#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) 1657cabdff1aSopenharmony_ci 1658cabdff1aSopenharmony_ci#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \ 1659cabdff1aSopenharmony_ci out0, out1, out2) \ 1660cabdff1aSopenharmony_ci{ \ 1661cabdff1aSopenharmony_ci SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1662cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \ 1663cabdff1aSopenharmony_ci} 1664cabdff1aSopenharmony_ci#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__) 1665cabdff1aSopenharmony_ci#define SPLATI_H3_SH(...) 
SPLATI_H3(v8i16, __VA_ARGS__) 1666cabdff1aSopenharmony_ci 1667cabdff1aSopenharmony_ci#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ 1668cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1669cabdff1aSopenharmony_ci{ \ 1670cabdff1aSopenharmony_ci SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ 1671cabdff1aSopenharmony_ci SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ 1672cabdff1aSopenharmony_ci} 1673cabdff1aSopenharmony_ci#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) 1674cabdff1aSopenharmony_ci#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) 1675cabdff1aSopenharmony_ci 1676cabdff1aSopenharmony_ci/* Description : Indexed word element values are replicated to all 1677cabdff1aSopenharmony_ci elements in output vector 1678cabdff1aSopenharmony_ci Arguments : Inputs - in, stidx 1679cabdff1aSopenharmony_ci Outputs - out0, out1 1680cabdff1aSopenharmony_ci Return Type - as per RTYPE 1681cabdff1aSopenharmony_ci Details : 'stidx' element value from 'in' vector is replicated to all 1682cabdff1aSopenharmony_ci elements in 'out0' vector 1683cabdff1aSopenharmony_ci 'stidx + 1' element value from 'in' vector is replicated to all 1684cabdff1aSopenharmony_ci elements in 'out1' vector 1685cabdff1aSopenharmony_ci Valid index range for halfword operation is 0-3 1686cabdff1aSopenharmony_ci*/ 1687cabdff1aSopenharmony_ci#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ 1688cabdff1aSopenharmony_ci{ \ 1689cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ 1690cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ 1691cabdff1aSopenharmony_ci} 1692cabdff1aSopenharmony_ci#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__) 1693cabdff1aSopenharmony_ci#define SPLATI_W2_SW(...) 
SPLATI_W2(v4i32, __VA_ARGS__) 1694cabdff1aSopenharmony_ci 1695cabdff1aSopenharmony_ci#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ 1696cabdff1aSopenharmony_ci{ \ 1697cabdff1aSopenharmony_ci SPLATI_W2(RTYPE, in, 0, out0, out1); \ 1698cabdff1aSopenharmony_ci SPLATI_W2(RTYPE, in, 2, out2, out3); \ 1699cabdff1aSopenharmony_ci} 1700cabdff1aSopenharmony_ci#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__) 1701cabdff1aSopenharmony_ci#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__) 1702cabdff1aSopenharmony_ci 1703cabdff1aSopenharmony_ci/* Description : Pack even byte elements of vector pairs 1704cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1705cabdff1aSopenharmony_ci Outputs - out0, out1 1706cabdff1aSopenharmony_ci Return Type - as per RTYPE 1707cabdff1aSopenharmony_ci Details : Even byte elements of in0 are copied to the left half of 1708cabdff1aSopenharmony_ci out0 & even byte elements of in1 are copied to the right 1709cabdff1aSopenharmony_ci half of out0. 1710cabdff1aSopenharmony_ci Even byte elements of in2 are copied to the left half of 1711cabdff1aSopenharmony_ci out1 & even byte elements of in3 are copied to the right 1712cabdff1aSopenharmony_ci half of out1. 1713cabdff1aSopenharmony_ci*/ 1714cabdff1aSopenharmony_ci#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1715cabdff1aSopenharmony_ci{ \ 1716cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \ 1717cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \ 1718cabdff1aSopenharmony_ci} 1719cabdff1aSopenharmony_ci#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) 1720cabdff1aSopenharmony_ci#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) 1721cabdff1aSopenharmony_ci#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) 1722cabdff1aSopenharmony_ci#define PCKEV_B2_SW(...) 
PCKEV_B2(v4i32, __VA_ARGS__) 1723cabdff1aSopenharmony_ci 1724cabdff1aSopenharmony_ci#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ 1725cabdff1aSopenharmony_ci{ \ 1726cabdff1aSopenharmony_ci PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1727cabdff1aSopenharmony_ci out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \ 1728cabdff1aSopenharmony_ci} 1729cabdff1aSopenharmony_ci#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__) 1730cabdff1aSopenharmony_ci#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__) 1731cabdff1aSopenharmony_ci 1732cabdff1aSopenharmony_ci#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1733cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1734cabdff1aSopenharmony_ci{ \ 1735cabdff1aSopenharmony_ci PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1736cabdff1aSopenharmony_ci PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1737cabdff1aSopenharmony_ci} 1738cabdff1aSopenharmony_ci#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) 1739cabdff1aSopenharmony_ci#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) 1740cabdff1aSopenharmony_ci#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) 1741cabdff1aSopenharmony_ci#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__) 1742cabdff1aSopenharmony_ci 1743cabdff1aSopenharmony_ci/* Description : Pack even halfword elements of vector pairs 1744cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1745cabdff1aSopenharmony_ci Outputs - out0, out1 1746cabdff1aSopenharmony_ci Return Type - as per RTYPE 1747cabdff1aSopenharmony_ci Details : Even halfword elements of in0 are copied to the left half of 1748cabdff1aSopenharmony_ci out0 & even halfword elements of in1 are copied to the right 1749cabdff1aSopenharmony_ci half of out0. 1750cabdff1aSopenharmony_ci Even halfword elements of in2 are copied to the left half of 1751cabdff1aSopenharmony_ci out1 & even halfword elements of in3 are copied to the right 1752cabdff1aSopenharmony_ci half of out1. 
1753cabdff1aSopenharmony_ci*/ 1754cabdff1aSopenharmony_ci#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1755cabdff1aSopenharmony_ci{ \ 1756cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \ 1757cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \ 1758cabdff1aSopenharmony_ci} 1759cabdff1aSopenharmony_ci#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) 1760cabdff1aSopenharmony_ci#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) 1761cabdff1aSopenharmony_ci 1762cabdff1aSopenharmony_ci#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1763cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1764cabdff1aSopenharmony_ci{ \ 1765cabdff1aSopenharmony_ci PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1766cabdff1aSopenharmony_ci PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1767cabdff1aSopenharmony_ci} 1768cabdff1aSopenharmony_ci#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) 1769cabdff1aSopenharmony_ci#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__) 1770cabdff1aSopenharmony_ci 1771cabdff1aSopenharmony_ci/* Description : Pack even double word elements of vector pairs 1772cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1773cabdff1aSopenharmony_ci Outputs - out0, out1 1774cabdff1aSopenharmony_ci Return Type - as per RTYPE 1775cabdff1aSopenharmony_ci Details : Even double elements of in0 are copied to the left half of 1776cabdff1aSopenharmony_ci out0 & even double elements of in1 are copied to the right 1777cabdff1aSopenharmony_ci half of out0. 1778cabdff1aSopenharmony_ci Even double elements of in2 are copied to the left half of 1779cabdff1aSopenharmony_ci out1 & even double elements of in3 are copied to the right 1780cabdff1aSopenharmony_ci half of out1. 
1781cabdff1aSopenharmony_ci*/ 1782cabdff1aSopenharmony_ci#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1783cabdff1aSopenharmony_ci{ \ 1784cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ 1785cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ 1786cabdff1aSopenharmony_ci} 1787cabdff1aSopenharmony_ci#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) 1788cabdff1aSopenharmony_ci#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__) 1789cabdff1aSopenharmony_ci#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) 1790cabdff1aSopenharmony_ci 1791cabdff1aSopenharmony_ci#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1792cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1793cabdff1aSopenharmony_ci{ \ 1794cabdff1aSopenharmony_ci PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1795cabdff1aSopenharmony_ci PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1796cabdff1aSopenharmony_ci} 1797cabdff1aSopenharmony_ci#define PCKEV_D4_UB(...) 
PCKEV_D4(v16u8, __VA_ARGS__) 1798cabdff1aSopenharmony_ci 1799cabdff1aSopenharmony_ci/* Description : Pack odd double word elements of vector pairs 1800cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 1801cabdff1aSopenharmony_ci Outputs - out0, out1 1802cabdff1aSopenharmony_ci Return Type - as per RTYPE 1803cabdff1aSopenharmony_ci Details : As operation is on same input 'in0' vector, index 1 double word 1804cabdff1aSopenharmony_ci element is overwritten to index 0 and result is written to out0 1805cabdff1aSopenharmony_ci As operation is on same input 'in1' vector, index 1 double word 1806cabdff1aSopenharmony_ci element is overwritten to index 0 and result is written to out1 1807cabdff1aSopenharmony_ci*/ 1808cabdff1aSopenharmony_ci#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1809cabdff1aSopenharmony_ci{ \ 1810cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \ 1811cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \ 1812cabdff1aSopenharmony_ci} 1813cabdff1aSopenharmony_ci#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__) 1814cabdff1aSopenharmony_ci#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__) 1815cabdff1aSopenharmony_ci#define PCKOD_D2_SD(...) 
PCKOD_D2(v2i64, __VA_ARGS__) 1816cabdff1aSopenharmony_ci 1817cabdff1aSopenharmony_ci/* Description : Each byte element is logically xor'ed with immediate 128 1818cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 1819cabdff1aSopenharmony_ci Outputs - in0, in1 (in-place) 1820cabdff1aSopenharmony_ci Return Type - as per RTYPE 1821cabdff1aSopenharmony_ci Details : Each unsigned byte element from input vector 'in0' is 1822cabdff1aSopenharmony_ci logically xor'ed with 128 and result is in-place stored in 1823cabdff1aSopenharmony_ci 'in0' vector 1824cabdff1aSopenharmony_ci Each unsigned byte element from input vector 'in1' is 1825cabdff1aSopenharmony_ci logically xor'ed with 128 and result is in-place stored in 1826cabdff1aSopenharmony_ci 'in1' vector 1827cabdff1aSopenharmony_ci Similar for other pairs 1828cabdff1aSopenharmony_ci*/ 1829cabdff1aSopenharmony_ci#define XORI_B2_128(RTYPE, in0, in1) \ 1830cabdff1aSopenharmony_ci{ \ 1831cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \ 1832cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \ 1833cabdff1aSopenharmony_ci} 1834cabdff1aSopenharmony_ci#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) 1835cabdff1aSopenharmony_ci#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) 1836cabdff1aSopenharmony_ci#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__) 1837cabdff1aSopenharmony_ci 1838cabdff1aSopenharmony_ci#define XORI_B3_128(RTYPE, in0, in1, in2) \ 1839cabdff1aSopenharmony_ci{ \ 1840cabdff1aSopenharmony_ci XORI_B2_128(RTYPE, in0, in1); \ 1841cabdff1aSopenharmony_ci in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \ 1842cabdff1aSopenharmony_ci} 1843cabdff1aSopenharmony_ci#define XORI_B3_128_SB(...) 
XORI_B3_128(v16i8, __VA_ARGS__) 1844cabdff1aSopenharmony_ci 1845cabdff1aSopenharmony_ci#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ 1846cabdff1aSopenharmony_ci{ \ 1847cabdff1aSopenharmony_ci XORI_B2_128(RTYPE, in0, in1); \ 1848cabdff1aSopenharmony_ci XORI_B2_128(RTYPE, in2, in3); \ 1849cabdff1aSopenharmony_ci} 1850cabdff1aSopenharmony_ci#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) 1851cabdff1aSopenharmony_ci#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) 1852cabdff1aSopenharmony_ci#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__) 1853cabdff1aSopenharmony_ci 1854cabdff1aSopenharmony_ci#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \ 1855cabdff1aSopenharmony_ci{ \ 1856cabdff1aSopenharmony_ci XORI_B3_128(RTYPE, in0, in1, in2); \ 1857cabdff1aSopenharmony_ci XORI_B2_128(RTYPE, in3, in4); \ 1858cabdff1aSopenharmony_ci} 1859cabdff1aSopenharmony_ci#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__) 1860cabdff1aSopenharmony_ci 1861cabdff1aSopenharmony_ci#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \ 1862cabdff1aSopenharmony_ci{ \ 1863cabdff1aSopenharmony_ci XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1864cabdff1aSopenharmony_ci XORI_B2_128(RTYPE, in4, in5); \ 1865cabdff1aSopenharmony_ci} 1866cabdff1aSopenharmony_ci#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__) 1867cabdff1aSopenharmony_ci 1868cabdff1aSopenharmony_ci#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ 1869cabdff1aSopenharmony_ci{ \ 1870cabdff1aSopenharmony_ci XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1871cabdff1aSopenharmony_ci XORI_B3_128(RTYPE, in4, in5, in6); \ 1872cabdff1aSopenharmony_ci} 1873cabdff1aSopenharmony_ci#define XORI_B7_128_SB(...) 
XORI_B7_128(v16i8, __VA_ARGS__) 1874cabdff1aSopenharmony_ci 1875cabdff1aSopenharmony_ci#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \ 1876cabdff1aSopenharmony_ci{ \ 1877cabdff1aSopenharmony_ci XORI_B4_128(RTYPE, in0, in1, in2, in3); \ 1878cabdff1aSopenharmony_ci XORI_B4_128(RTYPE, in4, in5, in6, in7); \ 1879cabdff1aSopenharmony_ci} 1880cabdff1aSopenharmony_ci#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__) 1881cabdff1aSopenharmony_ci#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__) 1882cabdff1aSopenharmony_ci 1883cabdff1aSopenharmony_ci/* Description : Addition of signed halfword elements and signed saturation 1884cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3 1885cabdff1aSopenharmony_ci Outputs - out0, out1 1886cabdff1aSopenharmony_ci Return Type - as per RTYPE 1887cabdff1aSopenharmony_ci Details : Signed halfword elements from 'in0' are added to signed 1888cabdff1aSopenharmony_ci halfword elements of 'in1'. The result is then signed saturated 1889cabdff1aSopenharmony_ci between -32768 to +32767 (as per halfword data type) 1890cabdff1aSopenharmony_ci Similar for other pairs 1891cabdff1aSopenharmony_ci*/ 1892cabdff1aSopenharmony_ci#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ 1893cabdff1aSopenharmony_ci{ \ 1894cabdff1aSopenharmony_ci out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \ 1895cabdff1aSopenharmony_ci out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \ 1896cabdff1aSopenharmony_ci} 1897cabdff1aSopenharmony_ci#define ADDS_SH2_SH(...) 
ADDS_SH2(v8i16, __VA_ARGS__) 1898cabdff1aSopenharmony_ci 1899cabdff1aSopenharmony_ci#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1900cabdff1aSopenharmony_ci out0, out1, out2, out3) \ 1901cabdff1aSopenharmony_ci{ \ 1902cabdff1aSopenharmony_ci ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ 1903cabdff1aSopenharmony_ci ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ 1904cabdff1aSopenharmony_ci} 1905cabdff1aSopenharmony_ci#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__) 1906cabdff1aSopenharmony_ci#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) 1907cabdff1aSopenharmony_ci 1908cabdff1aSopenharmony_ci/* Description : Shift left all elements of vector (generic for all data types) 1909cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, shift 1910cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1911cabdff1aSopenharmony_ci Return Type - as per input vector RTYPE 1912cabdff1aSopenharmony_ci Details : Each element of vector 'in0' is left shifted by 'shift' and 1913cabdff1aSopenharmony_ci result is in place written to 'in0' 1914cabdff1aSopenharmony_ci Similar for other pairs 1915cabdff1aSopenharmony_ci*/ 1916cabdff1aSopenharmony_ci#define SLLI_2V(in0, in1, shift) \ 1917cabdff1aSopenharmony_ci{ \ 1918cabdff1aSopenharmony_ci in0 = in0 << shift; \ 1919cabdff1aSopenharmony_ci in1 = in1 << shift; \ 1920cabdff1aSopenharmony_ci} 1921cabdff1aSopenharmony_ci#define SLLI_4V(in0, in1, in2, in3, shift) \ 1922cabdff1aSopenharmony_ci{ \ 1923cabdff1aSopenharmony_ci in0 = in0 << shift; \ 1924cabdff1aSopenharmony_ci in1 = in1 << shift; \ 1925cabdff1aSopenharmony_ci in2 = in2 << shift; \ 1926cabdff1aSopenharmony_ci in3 = in3 << shift; \ 1927cabdff1aSopenharmony_ci} 1928cabdff1aSopenharmony_ci 1929cabdff1aSopenharmony_ci/* Description : Arithmetic shift right all elements of vector 1930cabdff1aSopenharmony_ci (generic for all data types) 1931cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, shift 
1932cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1933cabdff1aSopenharmony_ci Return Type - as per input vector RTYPE 1934cabdff1aSopenharmony_ci Details : Each element of vector 'in0' is right shifted by 'shift' and 1935cabdff1aSopenharmony_ci result is in place written to 'in0' 1936cabdff1aSopenharmony_ci Here, 'shift' is GP variable passed in 1937cabdff1aSopenharmony_ci Similar for other pairs 1938cabdff1aSopenharmony_ci*/ 1939cabdff1aSopenharmony_ci#define SRA_4V(in0, in1, in2, in3, shift) \ 1940cabdff1aSopenharmony_ci{ \ 1941cabdff1aSopenharmony_ci in0 = in0 >> shift; \ 1942cabdff1aSopenharmony_ci in1 = in1 >> shift; \ 1943cabdff1aSopenharmony_ci in2 = in2 >> shift; \ 1944cabdff1aSopenharmony_ci in3 = in3 >> shift; \ 1945cabdff1aSopenharmony_ci} 1946cabdff1aSopenharmony_ci 1947cabdff1aSopenharmony_ci/* Description : Shift right logical all halfword elements of vector 1948cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, shift 1949cabdff1aSopenharmony_ci Outputs - in0, in1, in2, in3 (in place) 1950cabdff1aSopenharmony_ci Return Type - as per RTYPE 1951cabdff1aSopenharmony_ci Details : Each element of vector 'in0' is shifted right logical by 1952cabdff1aSopenharmony_ci number of bits respective element holds in vector 'shift' and 1953cabdff1aSopenharmony_ci result is in place written to 'in0' 1954cabdff1aSopenharmony_ci Here, 'shift' is a vector passed in 1955cabdff1aSopenharmony_ci Similar for other pairs 1956cabdff1aSopenharmony_ci*/ 1957cabdff1aSopenharmony_ci#define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \ 1958cabdff1aSopenharmony_ci{ \ 1959cabdff1aSopenharmony_ci in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \ 1960cabdff1aSopenharmony_ci in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \ 1961cabdff1aSopenharmony_ci in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \ 1962cabdff1aSopenharmony_ci in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \ 1963cabdff1aSopenharmony_ci} 
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* Description : Shift right logical rounded all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift (SRLR_H8 adds in4..in7)
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each vector is cast to v8i16 and passed to __msa_srlr_h with
                 the vector 'shift' (also cast to v8i16); the result is cast
                 to RTYPE and written back in place
                 Here, 'shift' is a vector passed in
*/
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    in0 = (RTYPE) __msa_srlr_h((v8i16) (in0), (v8i16) (shift)); \
    in1 = (RTYPE) __msa_srlr_h((v8i16) (in1), (v8i16) (shift)); \
    in2 = (RTYPE) __msa_srlr_h((v8i16) (in2), (v8i16) (shift)); \
    in3 = (RTYPE) __msa_srlr_h((v8i16) (in3), (v8i16) (shift)); \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
{ \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                           (SRAR_H3 adds in2, SRAR_H4 adds in2 and in3)
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift)); \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift)); \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift) \
{ \
    SRAR_H2(RTYPE, in0, in1, shift); \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift)); \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRAR_H2(RTYPE, in0, in1, shift); \
    SRAR_H2(RTYPE, in2, in3, shift); \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift (SRAR_W4 adds in2 and in3)
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift)); \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift)); \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRAR_W2(RTYPE, in0, in1, shift); \
    SRAR_W2(RTYPE, in2, in3, shift); \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Description : Shift right arithmetic rounded halfwords (immediate)
   Arguments   : Inputs  - in0, in1, shift (SRARI_H4 adds in2 and in3)
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 'shift' is handed straight to __msa_srari_h
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), shift); \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), shift); \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRARI_H2(RTYPE, in0, in1, shift); \
    SRARI_H2(RTYPE, in2, in3, shift); \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift (SRARI_W4 adds in2 and in3)
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 'shift' is handed straight to __msa_srari_w
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), shift); \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), shift); \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRARI_W2(RTYPE, in0, in1, shift); \
    SRARI_W2(RTYPE, in2, in3, shift); \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)

/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (MUL4 adds in4..in7)
                 Outputs - out0, out1 (MUL4 adds out2, out3)
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 Operands are parenthesized in the expansion, so expression
                 arguments such as 'a + 1' are multiplied as a whole
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (in0) * (in1); \
    out1 = (in2) * (in3); \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{ \
    MUL2(in0, in1, in2, in3, out0, out1); \
    MUL2(in4, in5, in6, in7, out2, out3); \
}

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ADD4 adds in4..in7)
                 Outputs - out0, out1 (ADD4 adds out2, out3)
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
                 Operands are parenthesized in the expansion, so expression
                 arguments such as 'b << 1' are added as a whole
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (in0) + (in1); \
    out1 = (in2) + (in3); \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{ \
    ADD2(in0, in1, in2, in3, out0, out1); \
    ADD2(in4, in5, in6, in7, out2, out3); \
}
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
                 Every parameter is parenthesized so that an expression
                 argument such as 'a + b' is subtracted as a unit.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
{ \
    (out0) = (in0) - (in1); \
    (out1) = (in2) - (in3); \
}
/* Four-pair variant: two SUB2 steps, out0..out3 receive
   in0 - in1, in2 - in3, in4 - in5 and in6 - in7 respectively. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{ \
    SUB2(in0, in1, in2, in3, out0, out1); \
    SUB2(in4, in5, in6, in7, out2, out3); \
}

/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in  (byte vector)
                 Output - out (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 8 halfword elements keeping sign intact
*/
#define UNPCK_R_SB_SH(in, out) \
{ \
    v16i8 sign_m; \
    \
    sign_m = __msa_clti_s_b((v16i8) (in), 0); \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) (in)); \
}

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in  (input halfword vector)
                 Outputs - out (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out) \
{ \
    v8i16 sign_m; \
    \
    sign_m = __msa_clti_s_h((v8i16) (in), 0); \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) (in)); \
}

/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1) \
{ \
    v16i8 tmp_m; \
    \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0); \
    ILVRL_B2_SH(tmp_m, in, out0, out1); \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1) \
{ \
    v16i8 zero_m = { 0 }; \
    \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
}

/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1) \
{ \
    v8i16 tmp_m; \
    \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0); \
    ILVRL_H2_SW(tmp_m, in, out0, out1); \
}

/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Swapping of two input variables through a temporary of the
                 type of 'in0'.
                 Unlike an xor based swap this keeps the value intact when
                 both arguments name the same object, and it does not require
                 the operands to support the '^' operator.
*/
#define SWAP(in0, in1) \
{ \
    __typeof__(in0) swap_tmp_m = (in0); \
    \
    (in0) = (in1); \
    (in1) = swap_tmp_m; \
}

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 The sums are stored before the inputs are read again for the
                 differences, so 'out0' and 'out1' must not alias any input.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    (out0) = (in0) + (in3); \
    (out1) = (in1) + (in2); \
    \
    (out2) = (in1) - (in2); \
    (out3) = (in0) - (in3); \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out[k]     = in[k] + in[7 - k]   for k = 0..3
                 out[7 - k] = in[k] - in[7 - k]   for k = 0..3
                 The sums are stored before the inputs are read again for the
                 differences, so 'out0'..'out3' must not alias any input.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    (out0) = (in0) + (in7); \
    (out1) = (in1) + (in6); \
    (out2) = (in2) + (in5); \
    (out3) = (in3) + (in4); \
    \
    (out4) = (in3) - (in4); \
    (out5) = (in2) - (in5); \
    (out6) = (in1) - (in6); \
    (out7) = (in0) - (in7); \
}

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
                 out[k]      = in[k] + in[15 - k]   for k = 0..7
                 out[15 - k] = in[k] - in[15 - k]   for k = 0..7
                 The sums are stored before the inputs are read again for the
                 differences, so 'out0'..'out7' must not alias any input.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
                     in8, in9, in10, in11, in12, in13, in14, in15, \
                     out0, out1, out2, out3, out4, out5, out6, out7, \
                     out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
    (out0) = (in0) + (in15); \
    (out1) = (in1) + (in14); \
    (out2) = (in2) + (in13); \
    (out3) = (in3) + (in12); \
    (out4) = (in4) + (in11); \
    (out5) = (in5) + (in10); \
    (out6) = (in6) + (in9); \
    (out7) = (in7) + (in8); \
    \
    (out8) = (in7) - (in8); \
    (out9) = (in6) - (in9); \
    (out10) = (in5) - (in10); \
    (out11) = (in4) - (in11); \
    (out12) = (in3) - (in12); \
    (out13) = (in2) - (in13); \
    (out14) = (in1) - (in14); \
    (out15) = (in0) - (in15); \
}

/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The four inputs are interleaved into 'out0'; each following
                 output is produced from the previous one with
                 __msa_sldi_b by 4 bytes against a zero vector.
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    v16i8 zero_m = { 0 }; \
    v16i8 s0_m, s1_m, s2_m, s3_m; \
    \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
    \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
}

/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : 'out0' and 'out2' are written first and then read to build
                 'out1' and 'out3', so they must be distinct variables.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                        out0, out1, out2, out3) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
    \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)

/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : The even outputs (out0, out2, out4, out6) are produced by
                 the interleave steps; the odd outputs are derived from them
                 by SLDI_B4 with a slide value of 8 against a zero vector.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                        out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    v16i8 zeros = { 0 }; \
    \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
    SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \
            8, out1, out3, out5, out7); \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)

/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : 'out1' and 'out3' hold intermediate data before they receive
                 their final values, so the outputs must not alias inputs
                 that are still to be read.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3) \
{ \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
    \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
    \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
    \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
    \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
}

/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : All eight outputs are used as intermediate storage by the
                 first interleave stage, so the outputs must not alias inputs
                 that are still to be read.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
    \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
    \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
}

/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : 'out0' and 'out2' are written first and then read to build
                 'out1' and 'out3', so they must be distinct variables.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    v8i16 s0_m, s1_m; \
    \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
}

/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : All inputs are consumed into local temporaries before any
                 output is written; even outputs come from PCKEV_D4 and odd
                 outputs from __msa_pckod_d on the same temporaries.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                       out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    v8i16 s0_m, s1_m; \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
             tmp3_m, tmp7_m, out0, out2, out4, out6); \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Word interleave of (in0, in1) and (in2, in3) into local
                 temporaries, followed by doubleword interleaves of those
                 temporaries into the four outputs.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    v4i32 s0_m, s1_m, s2_m, s3_m; \
    \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
    \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
}

/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 averaged (a + b)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 averaged (a + b)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 averaged (a + b)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 averaged (a + b)/2 and stored in 'tmp3_m'
                 The half vector results from all 4 vectors are stored in
                 destination memory as 8x4 byte block
                 (doubleword 0 of each average is extracted and written with
                 SD4 using 'pdst' and 'stride')
                 The vector parameters are parenthesized before being cast so
                 that an expression argument is converted as a whole.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1)); \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3)); \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5)); \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7)); \
    \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
}

/* Description : Average byte elements from pair of vectors and store 16x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 averaged (a + b)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 averaged (a + b)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 averaged (a + b)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 averaged (a + b)/2 and stored in 'tmp3_m'
                 The results from all 4 vectors are stored in destination
                 memory as 16x4 byte block
                 (the four full vectors are written with ST_UB4 using 'pdst'
                 and 'stride')
                 The vector parameters are parenthesized before being cast so
                 that an expression argument is converted as a whole.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1)); \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3)); \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5)); \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7)); \
    \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
}
2582cabdff1aSopenharmony_ci 2583cabdff1aSopenharmony_ci/* Description : Average rounded byte elements from pair of vectors and store 2584cabdff1aSopenharmony_ci 8x4 byte block in destination memory 2585cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2586cabdff1aSopenharmony_ci Details : Each byte element from input vector pair 'in0' and 'in1' are 2587cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2588cabdff1aSopenharmony_ci Each byte element from input vector pair 'in2' and 'in3' are 2589cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2590cabdff1aSopenharmony_ci Each byte element from input vector pair 'in4' and 'in5' are 2591cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2592cabdff1aSopenharmony_ci Each byte element from input vector pair 'in6' and 'in7' are 2593cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2594cabdff1aSopenharmony_ci The half vector results from all 4 vectors are stored in 2595cabdff1aSopenharmony_ci destination memory as 8x4 byte block 2596cabdff1aSopenharmony_ci*/ 2597cabdff1aSopenharmony_ci#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 2598cabdff1aSopenharmony_ci{ \ 2599cabdff1aSopenharmony_ci uint64_t out0_m, out1_m, out2_m, out3_m; \ 2600cabdff1aSopenharmony_ci v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \ 2601cabdff1aSopenharmony_ci \ 2602cabdff1aSopenharmony_ci AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2603cabdff1aSopenharmony_ci tp0_m, tp1_m, tp2_m, tp3_m); \ 2604cabdff1aSopenharmony_ci \ 2605cabdff1aSopenharmony_ci out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \ 2606cabdff1aSopenharmony_ci out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \ 2607cabdff1aSopenharmony_ci out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \ 2608cabdff1aSopenharmony_ci out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \ 2609cabdff1aSopenharmony_ci SD4(out0_m, out1_m, out2_m, out3_m, 
pdst, stride); \ 2610cabdff1aSopenharmony_ci} 2611cabdff1aSopenharmony_ci 2612cabdff1aSopenharmony_ci/* Description : Average rounded byte elements from pair of vectors and store 2613cabdff1aSopenharmony_ci 16x4 byte block in destination memory 2614cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride 2615cabdff1aSopenharmony_ci Details : Each byte element from input vector pair 'in0' and 'in1' are 2616cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2617cabdff1aSopenharmony_ci Each byte element from input vector pair 'in2' and 'in3' are 2618cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2619cabdff1aSopenharmony_ci Each byte element from input vector pair 'in4' and 'in5' are 2620cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2621cabdff1aSopenharmony_ci Each byte element from input vector pair 'in6' and 'in7' are 2622cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2623cabdff1aSopenharmony_ci The vector results from all 4 vectors are stored in 2624cabdff1aSopenharmony_ci destination memory as 16x4 byte block 2625cabdff1aSopenharmony_ci*/ 2626cabdff1aSopenharmony_ci#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 2627cabdff1aSopenharmony_ci{ \ 2628cabdff1aSopenharmony_ci v16u8 t0_m, t1_m, t2_m, t3_m; \ 2629cabdff1aSopenharmony_ci \ 2630cabdff1aSopenharmony_ci AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2631cabdff1aSopenharmony_ci t0_m, t1_m, t2_m, t3_m); \ 2632cabdff1aSopenharmony_ci ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \ 2633cabdff1aSopenharmony_ci} 2634cabdff1aSopenharmony_ci 2635cabdff1aSopenharmony_ci/* Description : Average rounded byte elements from pair of vectors, 2636cabdff1aSopenharmony_ci average rounded with destination and store 8x4 byte block 2637cabdff1aSopenharmony_ci in destination memory 2638cabdff1aSopenharmony_ci Arguments : Inputs - in0, 
in1, in2, in3, in4, in5, in6, in7, pdst, stride 2639cabdff1aSopenharmony_ci Details : Each byte element from input vector pair 'in0' and 'in1' are 2640cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2641cabdff1aSopenharmony_ci Each byte element from input vector pair 'in2' and 'in3' are 2642cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2643cabdff1aSopenharmony_ci Each byte element from input vector pair 'in4' and 'in5' are 2644cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2645cabdff1aSopenharmony_ci Each byte element from input vector pair 'in6' and 'in7' are 2646cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2647cabdff1aSopenharmony_ci The half vector results from all 4 vectors are stored in 2648cabdff1aSopenharmony_ci destination memory as 8x4 byte block 2649cabdff1aSopenharmony_ci*/ 2650cabdff1aSopenharmony_ci#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2651cabdff1aSopenharmony_ci pdst, stride) \ 2652cabdff1aSopenharmony_ci{ \ 2653cabdff1aSopenharmony_ci v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2654cabdff1aSopenharmony_ci v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ 2655cabdff1aSopenharmony_ci \ 2656cabdff1aSopenharmony_ci LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \ 2657cabdff1aSopenharmony_ci AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2658cabdff1aSopenharmony_ci tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 2659cabdff1aSopenharmony_ci AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \ 2660cabdff1aSopenharmony_ci dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \ 2661cabdff1aSopenharmony_ci} 2662cabdff1aSopenharmony_ci 2663cabdff1aSopenharmony_ci/* Description : Average rounded byte elements from pair of vectors, 2664cabdff1aSopenharmony_ci average rounded with destination and store 16x4 byte block 2665cabdff1aSopenharmony_ci in destination memory 2666cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, in4, 
in5, in6, in7, pdst, stride 2667cabdff1aSopenharmony_ci Details : Each byte element from input vector pair 'in0' and 'in1' are 2668cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp0_m' 2669cabdff1aSopenharmony_ci Each byte element from input vector pair 'in2' and 'in3' are 2670cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp1_m' 2671cabdff1aSopenharmony_ci Each byte element from input vector pair 'in4' and 'in5' are 2672cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp2_m' 2673cabdff1aSopenharmony_ci Each byte element from input vector pair 'in6' and 'in7' are 2674cabdff1aSopenharmony_ci average rounded (a + b + 1)/2 and stored in 'tmp3_m' 2675cabdff1aSopenharmony_ci The vector results from all 4 vectors are stored in 2676cabdff1aSopenharmony_ci destination memory as 16x4 byte block 2677cabdff1aSopenharmony_ci*/ 2678cabdff1aSopenharmony_ci#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2679cabdff1aSopenharmony_ci pdst, stride) \ 2680cabdff1aSopenharmony_ci{ \ 2681cabdff1aSopenharmony_ci v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2682cabdff1aSopenharmony_ci v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ 2683cabdff1aSopenharmony_ci \ 2684cabdff1aSopenharmony_ci LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \ 2685cabdff1aSopenharmony_ci AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ 2686cabdff1aSopenharmony_ci tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ 2687cabdff1aSopenharmony_ci AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \ 2688cabdff1aSopenharmony_ci dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \ 2689cabdff1aSopenharmony_ci} 2690cabdff1aSopenharmony_ci 2691cabdff1aSopenharmony_ci/* Description : Add block 4x4 2692cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, pdst, stride 2693cabdff1aSopenharmony_ci Details : Least significant 4 bytes from each input vector are added to 2694cabdff1aSopenharmony_ci the destination bytes, clipped between 0-255 and then 
stored. 2695cabdff1aSopenharmony_ci*/ 2696cabdff1aSopenharmony_ci#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ 2697cabdff1aSopenharmony_ci{ \ 2698cabdff1aSopenharmony_ci uint32_t src0_m, src1_m, src2_m, src3_m; \ 2699cabdff1aSopenharmony_ci uint32_t out0_m, out1_m, out2_m, out3_m; \ 2700cabdff1aSopenharmony_ci v8i16 inp0_m, inp1_m, res0_m, res1_m; \ 2701cabdff1aSopenharmony_ci v16i8 dst0_m = { 0 }; \ 2702cabdff1aSopenharmony_ci v16i8 dst1_m = { 0 }; \ 2703cabdff1aSopenharmony_ci v16i8 zero_m = { 0 }; \ 2704cabdff1aSopenharmony_ci \ 2705cabdff1aSopenharmony_ci ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ 2706cabdff1aSopenharmony_ci LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ 2707cabdff1aSopenharmony_ci INSERT_W2_SB(src0_m, src1_m, dst0_m); \ 2708cabdff1aSopenharmony_ci INSERT_W2_SB(src2_m, src3_m, dst1_m); \ 2709cabdff1aSopenharmony_ci ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ 2710cabdff1aSopenharmony_ci ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ 2711cabdff1aSopenharmony_ci CLIP_SH2_0_255(res0_m, res1_m); \ 2712cabdff1aSopenharmony_ci PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ 2713cabdff1aSopenharmony_ci \ 2714cabdff1aSopenharmony_ci out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \ 2715cabdff1aSopenharmony_ci out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \ 2716cabdff1aSopenharmony_ci out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \ 2717cabdff1aSopenharmony_ci out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \ 2718cabdff1aSopenharmony_ci SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ 2719cabdff1aSopenharmony_ci} 2720cabdff1aSopenharmony_ci 2721cabdff1aSopenharmony_ci/* Description : Dot product and addition of 3 signed halfword input vectors 2722cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 2723cabdff1aSopenharmony_ci Outputs - out0_m 2724cabdff1aSopenharmony_ci Return Type - signed halfword 2725cabdff1aSopenharmony_ci Details : Dot product of 'in0' with 
'coeff0' 2726cabdff1aSopenharmony_ci Dot product of 'in1' with 'coeff1' 2727cabdff1aSopenharmony_ci Dot product of 'in2' with 'coeff2' 2728cabdff1aSopenharmony_ci Addition of all the 3 vector results 2729cabdff1aSopenharmony_ci 2730cabdff1aSopenharmony_ci out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2) 2731cabdff1aSopenharmony_ci*/ 2732cabdff1aSopenharmony_ci#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 2733cabdff1aSopenharmony_ci( { \ 2734cabdff1aSopenharmony_ci v8i16 out0_m; \ 2735cabdff1aSopenharmony_ci \ 2736cabdff1aSopenharmony_ci out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \ 2737cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \ 2738cabdff1aSopenharmony_ci out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \ 2739cabdff1aSopenharmony_ci \ 2740cabdff1aSopenharmony_ci out0_m; \ 2741cabdff1aSopenharmony_ci} ) 2742cabdff1aSopenharmony_ci 2743cabdff1aSopenharmony_ci/* Description : Pack even elements of input vectors & xor with 128 2744cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1 2745cabdff1aSopenharmony_ci Outputs - out_m 2746cabdff1aSopenharmony_ci Return Type - unsigned byte 2747cabdff1aSopenharmony_ci Details : Signed byte even elements from 'in0' and 'in1' are packed 2748cabdff1aSopenharmony_ci together in one vector and the resulted vector is xor'ed with 2749cabdff1aSopenharmony_ci 128 to shift the range from signed to unsigned byte 2750cabdff1aSopenharmony_ci*/ 2751cabdff1aSopenharmony_ci#define PCKEV_XORI128_UB(in0, in1) \ 2752cabdff1aSopenharmony_ci( { \ 2753cabdff1aSopenharmony_ci v16u8 out_m; \ 2754cabdff1aSopenharmony_ci out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \ 2755cabdff1aSopenharmony_ci out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \ 2756cabdff1aSopenharmony_ci out_m; \ 2757cabdff1aSopenharmony_ci} ) 2758cabdff1aSopenharmony_ci 2759cabdff1aSopenharmony_ci/* Description : Converts inputs to unsigned bytes, interleave, average & store 
2760cabdff1aSopenharmony_ci as 8x4 unsigned byte block 2761cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride 2762cabdff1aSopenharmony_ci*/ 2763cabdff1aSopenharmony_ci#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ 2764cabdff1aSopenharmony_ci dst0, dst1, pdst, stride) \ 2765cabdff1aSopenharmony_ci{ \ 2766cabdff1aSopenharmony_ci v16u8 tmp0_m, tmp1_m; \ 2767cabdff1aSopenharmony_ci uint8_t *pdst_m = (uint8_t *) (pdst); \ 2768cabdff1aSopenharmony_ci \ 2769cabdff1aSopenharmony_ci tmp0_m = PCKEV_XORI128_UB(in0, in1); \ 2770cabdff1aSopenharmony_ci tmp1_m = PCKEV_XORI128_UB(in2, in3); \ 2771cabdff1aSopenharmony_ci AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ 2772cabdff1aSopenharmony_ci ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \ 2773cabdff1aSopenharmony_ci} 2774cabdff1aSopenharmony_ci 2775cabdff1aSopenharmony_ci/* Description : Pack even byte elements, extract 0 & 2 index words from pair 2776cabdff1aSopenharmony_ci of results and store 4 words in destination memory as per 2777cabdff1aSopenharmony_ci stride 2778cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, in2, in3, pdst, stride 2779cabdff1aSopenharmony_ci*/ 2780cabdff1aSopenharmony_ci#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ 2781cabdff1aSopenharmony_ci{ \ 2782cabdff1aSopenharmony_ci uint32_t out0_m, out1_m, out2_m, out3_m; \ 2783cabdff1aSopenharmony_ci v16i8 tmp0_m, tmp1_m; \ 2784cabdff1aSopenharmony_ci \ 2785cabdff1aSopenharmony_ci PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 2786cabdff1aSopenharmony_ci \ 2787cabdff1aSopenharmony_ci out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \ 2788cabdff1aSopenharmony_ci out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \ 2789cabdff1aSopenharmony_ci out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \ 2790cabdff1aSopenharmony_ci out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \ 2791cabdff1aSopenharmony_ci \ 2792cabdff1aSopenharmony_ci SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ 
2793cabdff1aSopenharmony_ci} 2794cabdff1aSopenharmony_ci 2795cabdff1aSopenharmony_ci/* Description : Pack even byte elements and store byte vector in destination 2796cabdff1aSopenharmony_ci memory 2797cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, pdst 2798cabdff1aSopenharmony_ci*/ 2799cabdff1aSopenharmony_ci#define PCKEV_ST_SB(in0, in1, pdst) \ 2800cabdff1aSopenharmony_ci{ \ 2801cabdff1aSopenharmony_ci v16i8 tmp_m; \ 2802cabdff1aSopenharmony_ci tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \ 2803cabdff1aSopenharmony_ci ST_SB(tmp_m, (pdst)); \ 2804cabdff1aSopenharmony_ci} 2805cabdff1aSopenharmony_ci 2806cabdff1aSopenharmony_ci/* Description : Horizontal 2 tap filter kernel code 2807cabdff1aSopenharmony_ci Arguments : Inputs - in0, in1, mask, coeff, shift 2808cabdff1aSopenharmony_ci*/ 2809cabdff1aSopenharmony_ci#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ 2810cabdff1aSopenharmony_ci( { \ 2811cabdff1aSopenharmony_ci v16i8 tmp0_m; \ 2812cabdff1aSopenharmony_ci v8u16 tmp1_m; \ 2813cabdff1aSopenharmony_ci \ 2814cabdff1aSopenharmony_ci tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \ 2815cabdff1aSopenharmony_ci tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \ 2816cabdff1aSopenharmony_ci tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \ 2817cabdff1aSopenharmony_ci tmp1_m = __msa_sat_u_h(tmp1_m, shift); \ 2818cabdff1aSopenharmony_ci \ 2819cabdff1aSopenharmony_ci tmp1_m; \ 2820cabdff1aSopenharmony_ci} ) 2821cabdff1aSopenharmony_ci#endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */ 2822